Example #1
def createMafDatabase(syn,
                      databaseToSynIdMappingDf,
                      testing=False,
                      staging=False):
    mafDatabaseSynId = process_functions.getDatabaseSynId(
        syn, "vcf2maf", databaseToSynIdMappingDf=databaseToSynIdMappingDf)
    mafDatabaseEnt = syn.get(mafDatabaseSynId)
    mafCols = list(syn.getTableColumns(mafDatabaseSynId))
    schema = synapseclient.Schema(
        name='Narrow MAF %s Database' % time.time(),
        columns=mafCols,
        parent=process_functions.getDatabaseSynId(
            syn, "main", databaseToSynIdMappingDf=databaseToSynIdMappingDf))
    schema.primaryKey = mafDatabaseEnt.primaryKey
    newMafDb = syn.store(schema)
    # Store the new database synid in the mapping table
    databaseToSynIdMappingDf['Id'][0] = newMafDb.id
    syn.store(
        synapseclient.Table(
            process_functions.getDatabaseSynId(syn, "dbMapping", test=testing),
            databaseToSynIdMappingDf))
    if not staging and not testing:
        # Make sure to store the newly created maf db synid
        # into the staging synapse mapping
        databaseToSynIdMapping = syn.tableQuery(
            "SELECT * FROM syn12094210 where Database = 'vcf2maf'")
        databaseToSynIdMappingDf = databaseToSynIdMapping.asDataFrame()
        databaseToSynIdMappingDf['Id'][0] = newMafDb.id
        syn.store(synapseclient.Table("syn12094210", databaseToSynIdMappingDf))
    # Move and archive the old maf database
    mafDatabaseEnt.parentId = "syn7208886"
    mafDatabaseEnt.name = "ARCHIVED " + mafDatabaseEnt.name
    syn.store(mafDatabaseEnt)
    mafDatabaseSynId = newMafDb.id
    # Remove 'can download' permissions from the project GENIE team
    syn.setPermissions(mafDatabaseSynId, 3326313, [])
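A minimal invocation sketch for the function above, assuming a live Synapse session; the mapping-table ID is a placeholder (syn00000000 is not a real table), and staging=True keeps the sketch away from the production mapping update:

import synapseclient

syn = synapseclient.login()
# The mapping table pairs database names ('vcf2maf', 'main', ...) with Synapse IDs
mapping = syn.tableQuery("SELECT * FROM syn00000000")  # placeholder table ID
databaseToSynIdMappingDf = mapping.asDataFrame()
createMafDatabase(syn, databaseToSynIdMappingDf, testing=False, staging=True)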
Example #2
    def _validate(self, mutationInCisDf, project_id):
        databaseToSynIdMappingDf = process_functions.get_synid_database_mappingdf(
            self.syn, project_id)
        mutationInCisSynId = process_functions.getDatabaseSynId(
            self.syn,
            "mutationsInCis",
            databaseToSynIdMappingDf=databaseToSynIdMappingDf,
        )
        # Pull down the correct database
        existingMergeCheck = self.syn.tableQuery(
            "select * from {} where Center = '{}'".format(
                mutationInCisSynId, self.center))
        existingMergeCheckDf = existingMergeCheck.asDataFrame()

        total_error = ""
        warning = ""
        required_headers = pd.Series([
            "Flag",
            "Center",
            "Tumor_Sample_Barcode",
            "Hugo_Symbol",
            "HGVSp_Short",
            "Variant_Classification",
            "Chromosome",
            "Start_Position",
            "Reference_Allele",
            "Tumor_Seq_Allele2",
            "t_alt_count_num",
            "t_depth",
        ])
        primaryKeys = [
            "Tumor_Sample_Barcode",
            "HGVSp_Short",
            "Start_Position",
            "Reference_Allele",
            "Tumor_Seq_Allele2",
        ]
        if not all(required_headers.isin(mutationInCisDf.columns)):
            missing_headers = required_headers[
                ~required_headers.isin(mutationInCisDf.columns)]
            total_error += ("Mutations In Cis Filter File: "
                            "Must at least have these headers: %s.\n" %
                            ",".join(missing_headers))
        else:
            new = mutationInCisDf[primaryKeys].fillna("")
            existing = existingMergeCheckDf[primaryKeys].fillna("")

            existing["primaryAll"] = [
                " ".join(values.astype(str))
                for i, values in existing.iterrows()
            ]
            new["primaryAll"] = [
                " ".join(values.astype(str)) for i, values in new.iterrows()
            ]
            if not all(new.primaryAll.isin(existing.primaryAll)):
                total_error += ("Mutations In Cis Filter File: "
                                "All variants must come from the original "
                                "mutationInCis_filtered_samples.csv file in "
                                "each institution's staging folder.\n")
        return total_error, warning
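The heart of this check is concatenating the primary-key columns into one string per row and testing membership against the existing table; the same logic on toy frames, as a standalone sketch:

import pandas as pd

primary_keys = ["Tumor_Sample_Barcode", "HGVSp_Short", "Start_Position",
                "Reference_Allele", "Tumor_Seq_Allele2"]
new = pd.DataFrame([["S1", "p.V600E", 140453136, "A", "T"]],
                   columns=primary_keys)
existing = pd.DataFrame([["S1", "p.V600E", 140453136, "A", "T"],
                         ["S2", "p.G12D", 25398284, "C", "T"]],
                        columns=primary_keys)

def row_keys(df):
    # One space-joined string per row, mirroring the 'primaryAll' column above
    return df.fillna("").astype(str).agg(" ".join, axis=1)

print(all(row_keys(new).isin(row_keys(existing))))  # True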
Example #3
    def _process(self, cnaDf, test=False):
        # Uppercase the first column header and normalize the gene column name
        cnaDf.rename(columns={cnaDf.columns[0]: cnaDf.columns[0].upper()},
                     inplace=True)
        cnaDf.rename(columns={"HUGO_SYMBOL": "Hugo_Symbol"}, inplace=True)

        index = [i for i, col in enumerate(cnaDf.columns)
                 if col.upper() == "ENTREZ_GENE_ID"]
        if len(index) > 0:
            del cnaDf[cnaDf.columns[index][0]]
        bedSynId = process_functions.getDatabaseSynId(self.syn, "bed", test=test)
        bed = self.syn.tableQuery("select Hugo_Symbol, ID from %s where CENTER = '%s'" % (bedSynId, self.center))
        bedDf = bed.asDataFrame()
        cnaDf['Hugo_Symbol'] = cnaDf['Hugo_Symbol'].apply(
            lambda x: validateSymbol(x, bedDf))
        order = cnaDf.columns

        cnaDf = cnaDf[~cnaDf['Hugo_Symbol'].isnull()]
        duplicatedGenes = pd.DataFrame()
        for i in cnaDf['Hugo_Symbol'][cnaDf['Hugo_Symbol'].duplicated()].unique():
            dups = cnaDf[cnaDf['Hugo_Symbol'] == i]
            newVal = dups[dups.columns[dups.columns != "Hugo_Symbol"]].apply(
                mergeCNAvalues)
            temp = pd.DataFrame(newVal).transpose()
            temp['Hugo_Symbol'] = i
            duplicatedGenes = duplicatedGenes.append(temp, sort=False)
        cnaDf.drop_duplicates('Hugo_Symbol', keep=False, inplace=True)
        cnaDf = cnaDf.append(duplicatedGenes, sort=False)
        cnaDf = cnaDf[order]
        cnaDf.columns = [process_functions.checkGenieId(i, self.center)
                         if i != "Hugo_Symbol" else i for i in cnaDf.columns]
        return cnaDf
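In the duplicated-gene merge above, mergeCNAvalues (defined elsewhere in the module) collapses the per-sample values of rows that remapped to the same symbol. A rough standalone equivalent using groupby, assuming "first non-null value wins" as the merge rule (the project's actual rule may differ):

import pandas as pd

cna = pd.DataFrame({"Hugo_Symbol": ["AMER1", "AMER1", "KRAS"],
                    "GENIE-SAGE-1": [1.0, None, 0.0],
                    "GENIE-SAGE-2": [2.0, 2.0, -1.0]})
# One row per gene; duplicates merged by taking the first non-null value
merged = cna.groupby("Hugo_Symbol", as_index=False).first()
print(merged)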
Example #4
def retract(syn, test=False):
    '''
    Main retraction function

    Args:
        syn: Synapse object
        test: Use test files instead of main files. Defaults to False
    '''

    patientRetract = syn.tableQuery('select * from %s' %
                                    process_functions.getDatabaseSynId(
                                        syn, "patientRetraction", test=test))
    patientRetractIds = patientRetract.asDataFrame()
    # Grab all clinical samples that belong to retracted patients
    # and append them to the sample retraction list
    sampleClinical = syn.tableQuery(
        'select * from %s' %
        process_functions.getDatabaseSynId(syn, "sample", test=test))
    sampleClinicalDf = sampleClinical.asDataFrame()
    appendSamples = sampleClinicalDf['SAMPLE_ID'][
        sampleClinicalDf['PATIENT_ID'].isin(patientRetractIds.geniePatientId)]

    sampleRetract = syn.tableQuery(
        'select * from %s' %
        process_functions.getDatabaseSynId(syn, "sampleRetraction", test=test))
    sampleRetractIds = sampleRetract.asDataFrame()

    allRetractedSamples = sampleRetractIds['genieSampleId'].append(
        appendSamples)

    # Only need to retract clinical data, because the rest of the data
    # is filtered by the clinical data
    # Sample clinical data
    retract_samples(
        syn, process_functions.getDatabaseSynId(syn, "sample", test=test),
        "SAMPLE_ID", allRetractedSamples)
    # Patient clinical data
    retract_samples(
        syn, process_functions.getDatabaseSynId(syn, "patient", test=test),
        "PATIENT_ID", patientRetractIds['geniePatientId'])
Example #5
    def _validate(self, fusionDF, noSymbolCheck, testing=False):
        total_error = ""
        warning = ""

        # Frame: "in-frame" or "frameshift".
        # Fusion_Status (OPTIONAL): An assessment of the mutation type
        # (i.e., "SOMATIC", "GERMLINE", "UNKNOWN", or empty)

        fusionDF.columns = [col.upper() for col in fusionDF.columns]

        REQUIRED_HEADERS = pd.Series([
            'HUGO_SYMBOL', 'ENTREZ_GENE_ID', 'CENTER', 'TUMOR_SAMPLE_BARCODE',
            'FUSION', 'DNA_SUPPORT', 'RNA_SUPPORT', 'METHOD', 'FRAME'
        ])
        if fusionDF.get("COMMENTS") is None:
            fusionDF['COMMENTS'] = float('nan')
        if not all(REQUIRED_HEADERS.isin(fusionDF.columns)):
            total_error += "Your fusion file must at least have these headers: %s.\n" % ",".join(
                REQUIRED_HEADERS[~REQUIRED_HEADERS.isin(fusionDF.columns)])
        if process_functions.checkColExist(
                fusionDF, "HUGO_SYMBOL") and not noSymbolCheck:
            bedSynId = process_functions.getDatabaseSynId(self.syn,
                                                          "bed",
                                                          test=testing)
            bed = self.syn.tableQuery(
                "select Hugo_Symbol, ID from %s where CENTER = '%s'" %
                (bedSynId, self.center))
            bedDf = bed.asDataFrame()
            fusionDF = fusionDF.drop_duplicates("HUGO_SYMBOL").apply(
                lambda x: validateSymbol(x, bedDf), axis=1)
            if fusionDF["HUGO_SYMBOL"].isnull().any():
                total_error += "Your fusion file should not have any NA/blank Hugo Symbols.\n"

        # if process_functions.checkColExist(fusionDF, "DNA_SUPPORT"):
        #     if not fusionDF.DNA_SUPPORT.isin(["yes","no","unknown"]).all():
        #         total_error += "Your fusion file's DNA_SUPPORT column must be 'yes', 'no', or 'unknown'"

        # if process_functions.checkColExist(fusionDF, "RNA_SUPPORT"):
        #     if not fusionDF.RNA_SUPPORT.isin(["yes","no","unknown"]).all():
        #         total_error += "Your fusion file's RNA_SUPPORT column must be 'yes', 'no', or 'unknown'"

        # if process_functions.checkColExist(fusionDF, "FRAME"):
        #     if not fusionDF.FRAME.isin(["in-frame","frameshift"]).all():
        #         total_error += "Your fusion file's FRAME column must be 'in-frame', or 'frameshift'"

        return (total_error, warning)
Example #6
    def _process(self, fusion, databaseToSynIdMappingDf):
        fusion.columns = [col.upper() for col in fusion.columns]
        fusion["CENTER"] = self.center
        newsamples = [
            process_functions.checkGenieId(i, self.center)
            for i in fusion["TUMOR_SAMPLE_BARCODE"]
        ]
        fusion["TUMOR_SAMPLE_BARCODE"] = newsamples

        # This is temporary, because comments column will be removed
        # if fusion.get("COMMENTS") is None:
        #    fusion['COMMENTS'] = ""
        # #Will remove comments column
        # fusion['COMMENTS'] = ""
        fusion["ENTREZ_GENE_ID"] = fusion["ENTREZ_GENE_ID"].fillna(0)
        fusion = fusion.drop_duplicates()
        fusion["ID"] = fusion["HUGO_SYMBOL"].copy()
        bedSynId = process_functions.getDatabaseSynId(
            self.syn, "bed", databaseToSynIdMappingDf=databaseToSynIdMappingDf)
        bed = self.syn.tableQuery(
            "select Hugo_Symbol, ID from %s where CENTER = '%s'" %
            (bedSynId, self.center))
        bedDf = bed.asDataFrame()
        fusion = fusion.apply(lambda x: validateSymbol(x, bedDf), axis=1)
        # Create nonmapped gene dict
        remapped = fusion[fusion["HUGO_SYMBOL"] != fusion["ID"]]
        remapped = remapped[~remapped.HUGO_SYMBOL.isnull()]
        temp = remapped[["HUGO_SYMBOL", "ID"]]
        temp.drop_duplicates(inplace=True)
        temp.index = temp.ID
        del temp["ID"]
        fusion["FUSION"] = fusion["FUSION"].fillna("")
        fusion, nonmapped = remapFusion(temp.to_dict()["HUGO_SYMBOL"], fusion,
                                        "FUSION")
        # Fill in blank hugo symbol columns with the original symbol
        null_symbols_idx = fusion["HUGO_SYMBOL"].isnull()
        fusion.loc[null_symbols_idx, "HUGO_SYMBOL"] = fusion["ID"][
            null_symbols_idx]
        fusion["ENTREZ_GENE_ID"] = [
            int(float(i)) for i in fusion["ENTREZ_GENE_ID"]
        ]
        return fusion
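The nonmapped-gene dictionary built above maps each original symbol (kept in the ID column) to its remapped HUGO_SYMBOL; the same index/to_dict pattern in isolation:

import pandas as pd

fusion = pd.DataFrame({"ID": ["FAM123B", "KRAS"],
                       "HUGO_SYMBOL": ["AMER1", "KRAS"]})
changed = fusion[fusion["HUGO_SYMBOL"] != fusion["ID"]].drop_duplicates()
mapping = changed.set_index("ID")["HUGO_SYMBOL"].to_dict()
print(mapping)  # {'FAM123B': 'AMER1'}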
Example #7
    def _validate(self, mutationInCisDf, testing=False):
        mutationInCisSynId = process_functions.getDatabaseSynId(
            self.syn, "mutationsInCis", test=testing)
        # Pull down the correct database
        existingMergeCheck = self.syn.tableQuery(
            "select * from %s where Center = '%s'" %
            (mutationInCisSynId, self.center))
        existingMergeCheckDf = existingMergeCheck.asDataFrame()

        total_error = ""
        warning = ""
        REQUIRED_HEADERS = pd.Series([
            'Flag', 'Center', 'Tumor_Sample_Barcode', 'Hugo_Symbol',
            'HGVSp_Short', 'Variant_Classification', 'Chromosome',
            'Start_Position', 'Reference_Allele', 'Tumor_Seq_Allele2',
            't_alt_count_num', 't_depth'
        ])
        primaryKeys = [
            'Tumor_Sample_Barcode', 'HGVSp_Short', 'Start_Position',
            'Reference_Allele', 'Tumor_Seq_Allele2'
        ]
        if not all(REQUIRED_HEADERS.isin(mutationInCisDf.columns)):
            total_error += (
                "Mutations In Cis Filter File: "
                "Must at least have these headers: %s.\n" % ",".join(
                    REQUIRED_HEADERS[
                        ~REQUIRED_HEADERS.isin(mutationInCisDf.columns)]))
        else:
            new = mutationInCisDf[primaryKeys].fillna("")
            existing = existingMergeCheckDf[primaryKeys].fillna("")

            existing['primaryAll'] = [
                " ".join(values.astype(str))
                for i, values in existing.iterrows()
            ]
            new['primaryAll'] = [
                " ".join(values.astype(str)) for i, values in new.iterrows()
            ]
            if not all(new.primaryAll.isin(existing.primaryAll)):
                total_error += (
                    "Mutations In Cis Filter File: All variants must come "
                    "from the original mutationInCis_filtered_samples.csv "
                    "file in each institution's staging folder.\n")
        return (total_error, warning)
Example #8
    def _validate(self, cnvDF, noSymbolCheck, testing=False):
        total_error = ""
        warning = ""
        cnvDF.columns = [col.upper() for col in cnvDF.columns]

        if cnvDF.columns[0] != "HUGO_SYMBOL":
            total_error += "Your cnv file's first column must be Hugo_Symbol\n"
        haveColumn = process_functions.checkColExist(cnvDF, "HUGO_SYMBOL")
        if haveColumn:
            keepSymbols = cnvDF["HUGO_SYMBOL"]
            cnvDF.drop("HUGO_SYMBOL", axis=1, inplace=True)

        # if sum(cnvDF.apply(lambda x: sum(x.isnull()))) > 0:
        #   total_error += "Your cnv file must not have any empty values\n"

        if process_functions.checkColExist(cnvDF, "ENTREZ_GENE_ID"):
            del cnvDF['ENTREZ_GENE_ID']
        
        #cnvDF = cnvDF.fillna('')
        if not all(cnvDF.applymap(lambda x: str(x) in ['-2.0','-2','-1.5','-1.0','-1','0.0','0','0.5','1.0','1','1.5','2','2.0','nan']).all()):
            total_error += "All values must be NA/blank, -2, -1.5, -1, -0.5, 0, 0.5, 1, 1.5, or 2.\n"
        else:
            cnvDF['HUGO_SYMBOL'] = keepSymbols
            if haveColumn and not noSymbolCheck:
                bedSynId = process_functions.getDatabaseSynId(
                    self.syn, "bed", test=testing)
                bed = self.syn.tableQuery(
                    "select Hugo_Symbol, ID from %s where CENTER = '%s'" %
                    (bedSynId, self.center))
                bedDf = bed.asDataFrame()
                cnvDF['remapped'] = cnvDF['HUGO_SYMBOL'].apply(
                    lambda x: validateSymbol(x, bedDf))
                cnvDF = cnvDF[~cnvDF['remapped'].isnull()]

                # Do not allow any duplicated genes after symbols
                # have been remapped
                if sum(cnvDF['remapped'].duplicated()) > 0:
                    duplicated = cnvDF['remapped'].duplicated(keep=False)
                    total_error += (
                        "Your CNA file has duplicated Hugo_Symbols "
                        "(After remapping of genes): %s -> %s.\n" %
                        (",".join(cnvDF['HUGO_SYMBOL'][duplicated]),
                         ",".join(cnvDF['remapped'][duplicated])))
        return (total_error, warning)
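The matrix check above stringifies every cell and tests it against a whitelist of discrete copy-number calls; a standalone run of the same idea:

import pandas as pd

allowed = {'-2.0', '-2', '-1.5', '-1.0', '-1', '-0.5', '0.0', '0',
           '0.5', '1.0', '1', '1.5', '2', '2.0', 'nan'}
cnv = pd.DataFrame({"S1": [0, -1.5, 2.0], "S2": [float('nan'), 0.5, 3]})
ok = cnv.applymap(lambda x: str(x) in allowed).all().all()
print(ok)  # False: 3.0 is not a legal copy-number call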
Example #9
def main():
    """Set up argument parser and returns"""
    parser = argparse.ArgumentParser(
        description='GENIE center inputs to database')
    parser.add_argument("process",
                        choices=['vcf', 'maf', 'main', 'mafSP'],
                        help='Process vcf, maf or the rest of the files')
    parser.add_argument('--center', help='The center')
    parser.add_argument("--pemFile",
                        type=str,
                        help="Path to PEM file (genie.pem)")
    parser.add_argument("--deleteOld",
                        action='store_true',
                        help="Delete all old processed and temp files")
    parser.add_argument("--onlyValidate",
                        action='store_true',
                        help="Only validate the files, don't process")
    parser.add_argument("--oncotreeLink",
                        type=str,
                        help="Link to oncotree code")
    parser.add_argument("--createNewMafDatabase",
                        action='store_true',
                        help="Creates a new maf database")
    parser.add_argument("--testing",
                        action='store_true',
                        help="Testing the infrastructure!")
    parser.add_argument("--debug",
                        action='store_true',
                        help="Add debug mode to synapse")
    parser.add_argument("--reference",
                        type=str,
                        help="Path to VCF reference file")

    # DEFAULT PARAMS
    parser.add_argument("--vcf2mafPath",
                        type=str,
                        help="Path to vcf2maf",
                        default="~/vcf2maf-1.6.14")
    parser.add_argument("--vepPath",
                        type=str,
                        help="Path to VEP",
                        default="~/vep")
    parser.add_argument("--vepData",
                        type=str,
                        help="Path to VEP data",
                        default="~/.vep")
    parser.add_argument('--thread',
                        type=int,
                        help="Number of threads to use for validation",
                        default=1)

    args = parser.parse_args()
    syn = process_functions.synLogin(args.pemFile, debug=args.debug)
    # Must specify path to vcf2maf, VEP and VEP data
    # if these types are specified
    if args.process in ['vcf', 'maf', 'mafSP'] and not args.onlyValidate:
        assert os.path.exists(
            args.vcf2mafPath
        ), "Path to vcf2maf (--vcf2mafPath) must be specified if `--process {vcf,maf,mafSP}` is used"
        assert os.path.exists(
            args.vepPath
        ), "Path to VEP (--vepPath) must be specified if `--process {vcf,maf,mafSP}` is used"
        assert os.path.exists(
            args.vepData
        ), "Path to VEP data (--vepData) must be specified if `--process {vcf,maf,mafSP}` is used"

    if args.testing:
        databaseToSynIdMapping = syn.tableQuery('SELECT * FROM syn11600968')
    else:
        databaseToSynIdMapping = syn.tableQuery('SELECT * FROM syn10967259')

    databaseToSynIdMappingDf = databaseToSynIdMapping.asDataFrame()

    center_mapping_id = process_functions.getDatabaseSynId(
        syn,
        "centerMapping",
        databaseToSynIdMappingDf=databaseToSynIdMappingDf)
    center_mapping = syn.tableQuery('SELECT * FROM %s' % center_mapping_id)
    center_mapping_df = center_mapping.asDataFrame()

    if args.center is not None:
        assert args.center in center_mapping_df.center.tolist(), \
            "Must specify one of these centers: %s" % ", ".join(
                center_mapping_df.center)
        centers = [args.center]
    else:
        center_mapping_df = center_mapping_df[
            ~center_mapping_df['inputSynId'].isnull()]
        # release is a bool column
        center_mapping_df = center_mapping_df[center_mapping_df['release']]
        centers = center_mapping_df.center

    if args.oncotreeLink is None:
        onco_link = databaseToSynIdMappingDf['Id'][
            databaseToSynIdMappingDf['Database'] == 'oncotreeLink'].values[0]
        onco_link_ent = syn.get(onco_link)
        args.oncotreeLink = onco_link_ent.externalURL
    # Check if you can connect to the oncotree link;
    # if not, don't run validation / processing
    process_functions.checkUrl(args.oncotreeLink)

    center_mapping_ent = syn.get(center_mapping_id)
    if center_mapping_ent.get('isProcessing', ['True'])[0] == 'True':
        raise Exception(
            "Processing/validation is currently happening.  Please change/add the 'isProcessing' annotation on %s to False to enable processing"
            % center_mapping_id)
    else:
        center_mapping_ent.isProcessing = "True"
        center_mapping_ent = syn.store(center_mapping_ent)
    # remove this query timeout and see what happens
    # syn.table_query_timeout = 50000

    # Create new maf database; this should only happen once if it's specified
    if args.createNewMafDatabase:
        createMafDatabase(syn, databaseToSynIdMappingDf, testing=args.testing)

    for center in centers:
        input_to_database(syn,
                          center,
                          args.process,
                          args.testing,
                          args.onlyValidate,
                          args.vcf2mafPath,
                          args.vepPath,
                          args.vepData,
                          databaseToSynIdMappingDf,
                          center_mapping_df,
                          reference=args.reference,
                          delete_old=args.deleteOld,
                          oncotree_link=args.oncotreeLink,
                          thread=args.thread)

    # To ensure that this is the new entity
    center_mapping_ent = syn.get(center_mapping_id)
    center_mapping_ent.isProcessing = "False"
    center_mapping_ent = syn.store(center_mapping_ent)

    error_tracker_synid = process_functions.getDatabaseSynId(
        syn, "errorTracker", databaseToSynIdMappingDf=databaseToSynIdMappingDf)
    # Only write out invalid reasons if the center isn't specified
    # and only validating
    if args.center is None and args.onlyValidate:
        logging.info("WRITING INVALID REASONS TO CENTER STAGING DIRS")
        write_invalid_reasons.write_invalid_reasons(syn, center_mapping_df,
                                                    error_tracker_synid)
Example #10
def input_to_database(syn,
                      center,
                      process,
                      testing,
                      only_validate,
                      vcf2maf_path,
                      vep_path,
                      vep_data,
                      database_to_synid_mappingdf,
                      center_mapping_df,
                      reference=None,
                      delete_old=False,
                      oncotree_link=None,
                      thread=1):
    if only_validate:
        log_path = os.path.join(process_functions.SCRIPT_DIR,
                                "%s_validation_log.txt" % center)
    else:
        log_path = os.path.join(process_functions.SCRIPT_DIR,
                                "%s_%s_log.txt" % (center, process))

    logFormatter = logging.Formatter(
        "%(asctime)s [%(name)s][%(levelname)s] %(message)s")
    fileHandler = logging.FileHandler(log_path, mode='w')
    fileHandler.setFormatter(logFormatter)
    logger.addHandler(fileHandler)

    if testing:
        logger.info("###########################################")
        logger.info("############NOW IN TESTING MODE############")
        logger.info("###########################################")

    # ----------------------------------------
    # Start input to staging process
    # ----------------------------------------

    # Make the synapseCache dir the genie input folder for now.
    # The main reason is that the .synapseCache dir is mounted by batch
    path_to_genie = os.path.expanduser("~/.synapseCache")
    # Create input and staging folders
    if not os.path.exists(os.path.join(path_to_genie, center, "input")):
        os.makedirs(os.path.join(path_to_genie, center, "input"))
    if not os.path.exists(os.path.join(path_to_genie, center, "staging")):
        os.makedirs(os.path.join(path_to_genie, center, "staging"))

    if delete_old:
        process_functions.rmFiles(os.path.join(path_to_genie, center))

    validFiles = validation(syn, center, process, center_mapping_df,
                            database_to_synid_mappingdf, thread, testing,
                            oncotree_link)

    if len(validFiles) > 0 and not only_validate:
        # Reorganize so BED files are always validated and processed first
        validBED = [
            os.path.basename(i).endswith('.bed') for i in validFiles['path']
        ]
        beds = validFiles[validBED]
        validFiles = beds.append(validFiles)
        validFiles.drop_duplicates(inplace=True)
        # Valid maf, mafsp, vcf and cbs files
        validMAF = [
            i for i in validFiles['path']
            if os.path.basename(i) == "data_mutations_extended_%s.txt" % center
        ]
        validMAFSP = [
            i for i in validFiles['path'] if os.path.basename(i) ==
            "nonGENIE_data_mutations_extended_%s.txt" % center
        ]
        validVCF = [
            i for i in validFiles['path']
            if os.path.basename(i).endswith('.vcf')
        ]
        #validCBS = [i for i in validFiles['path'] if os.path.basename(i).endswith('.cbs')]
        if process == 'mafSP':
            validMAFs = validMAFSP
        else:
            validMAFs = validMAF

        processTrackerSynId = process_functions.getDatabaseSynId(
            syn,
            "processTracker",
            databaseToSynIdMappingDf=database_to_synid_mappingdf)
        # Add process tracker for time start
        processTracker = syn.tableQuery(
            "SELECT timeStartProcessing FROM %s where center = '%s' and processingType = '%s'"
            % (processTrackerSynId, center, process))
        processTrackerDf = processTracker.asDataFrame()
        if len(processTrackerDf) == 0:
            new_rows = [[
                center,
                str(int(time.time() * 1000)),
                str(int(time.time() * 1000)), process
            ]]
            table = syn.store(
                synapseclient.Table(processTrackerSynId, new_rows))
        else:
            processTrackerDf['timeStartProcessing'][0] = str(
                int(time.time() * 1000))
            syn.store(
                synapseclient.Table(processTrackerSynId, processTrackerDf))

        processFiles(syn,
                     validFiles,
                     center,
                     path_to_genie,
                     thread,
                     center_mapping_df,
                     oncotree_link,
                     database_to_synid_mappingdf,
                     validVCF=validVCF,
                     validMAFs=validMAFs,
                     vcf2mafPath=vcf2maf_path,
                     veppath=vep_path,
                     vepdata=vep_data,
                     test=testing,
                     processing=process,
                     reference=reference)

        # Should add in this process end tracking before the deletion of samples
        processTracker = syn.tableQuery(
            "SELECT timeEndProcessing FROM %s where center = '%s' and processingType = '%s'"
            % (processTrackerSynId, center, process))
        processTrackerDf = processTracker.asDataFrame()
        processTrackerDf['timeEndProcessing'][0] = str(int(time.time() * 1000))
        syn.store(synapseclient.Table(processTrackerSynId, processTrackerDf))

        logger.info("SAMPLE/PATIENT RETRACTION")
        toRetract.retract(syn, testing)

    else:
        messageOut = ("%s does not have any valid files" if not only_validate
                      else "ONLY VALIDATION OCCURRED FOR %s")
        logger.info(messageOut % center)

    # Store log file
    syn.store(synapseclient.File(log_path, parentId="syn10155804"))
    os.remove(log_path)
    logger.info("ALL PROCESSES COMPLETE")
Example #11
def validation(syn, center, process, center_mapping_df,
               databaseToSynIdMappingDf, thread, testing, oncotreeLink):
    centerInputSynId = center_mapping_df['inputSynId'][
        center_mapping_df['center'] == center][0]
    logger.info("Center: " + center)
    allFiles = getCenterInputFiles(syn, centerInputSynId, center, process)

    allFiles = pd.DataFrame(allFiles, columns=['synId', 'filePaths'])
    # If a center has no files, return an empty list
    if allFiles.empty:
        logger.info("%s has not uploaded any files" % center)
        return []
    else:
        # Make sure the vcf validation statuses don't get wiped away
        if process != "vcf":
            addToQuery = "and name not like '%.vcf'"
        else:
            addToQuery = ''
        validationStatus = syn.tableQuery(
            "SELECT * FROM %s where center = '%s' %s" %
            (process_functions.getDatabaseSynId(
                syn,
                "validationStatus",
                databaseToSynIdMappingDf=databaseToSynIdMappingDf), center,
             addToQuery))
        errorTracker = syn.tableQuery(
            "SELECT * FROM %s where center = '%s' %s" %
            (process_functions.getDatabaseSynId(
                syn,
                "errorTracker",
                databaseToSynIdMappingDf=databaseToSynIdMappingDf), center,
             addToQuery))
        # VALIDATE FILES
        validationStatusDf = validationStatus.asDataFrame()
        errorTrackerDf = errorTracker.asDataFrame()
        validated = allFiles.apply(
            lambda x: validateFile(syn, validationStatusDf, errorTrackerDf,
                                   center, thread, x, testing, oncotreeLink),
            axis=1)
        inputValidStatus = []
        invalidErrors = []
        for inputStat, invalErrors in validated:
            inputValidStatus.extend(inputStat)
            if invalErrors is not None:
                invalidErrors.extend(invalErrors)
        inputValidStatus = pd.DataFrame(inputValidStatus,
                                        columns=[
                                            "id", 'path', 'md5', 'status',
                                            'name', 'modifiedOn', 'fileType'
                                        ])
        logger.info("CHECK FOR DUPLICATED FILES")
        ##### DUPLICATED FILES ######
        # Check for duplicated filenames. There should be no duplication;
        # files should be uploaded as new versions and the entire dataset
        # should be uploaded every time.
        # cbs and seg files should not be duplicated; there can only be one.
        duplicatedFiles = inputValidStatus[inputValidStatus['name'].duplicated(
            keep=False)]
        cbsSegBool = [
            os.path.basename(i).endswith('.cbs')
            or os.path.basename(i).endswith('.seg')
            for i in inputValidStatus['name']
        ]
        cbsSegFiles = inputValidStatus[cbsSegBool]
        if len(cbsSegFiles) > 1:
            duplicatedFiles = duplicatedFiles.append(cbsSegFiles)
        # nodups = ["data_mutations_extended"]
        # allDuplicatedFiles = []
        # for nodup in nodups:
        #   checkDups = [name for name in inputValidStatus['name'] if name.startswith(nodup)]
        #   if len(checkDups) > 1:
        #       allDuplicatedFiles.extend(checkDups)
        # duplicatedFiles = duplicatedFiles.append(inputValidStatus[inputValidStatus['name'].isin(allDuplicatedFiles)])

        duplicatedFiles.drop_duplicates("id", inplace=True)
        inputValidStatus.loc[
            inputValidStatus['id'].isin(duplicatedFiles['id']),
            'status'] = "INVALID"
        duplicatedFiles['errors'] = (
            "DUPLICATED FILENAME! FILES SHOULD BE UPLOADED AS NEW VERSIONS "
            "AND THE ENTIRE DATASET SHOULD BE UPLOADED EVERYTIME")
        # Send an email if there are any duplicated files
        if not duplicatedFiles.empty:
            incorrectFiles = ", ".join([
                name for synId, name in zip(duplicatedFiles['id'],
                                            duplicatedFiles['name'])
            ])
            incorrectEnt = syn.get(duplicatedFiles['id'].iloc[0])
            sendEmail = set([incorrectEnt.modifiedBy, incorrectEnt.createdBy])
            userNames = ", ".join(
                [syn.getUserProfile(user).userName for user in sendEmail])
            syn.sendMessage(
                list(sendEmail), "GENIE Validation Error",
                "Dear %s,\n\nYour files (%s) are duplicated!  FILES SHOULD BE UPLOADED AS NEW VERSIONS AND THE ENTIRE DATASET SHOULD BE UPLOADED EVERYTIME"
                % (userNames, incorrectFiles))
        logger.info("THERE ARE %d DUPLICATED FILES" % len(duplicatedFiles))
        ##### DUPLICATED FILES ######

        # Create invalid error synapse table
        logger.info("UPDATE INVALID FILE REASON DATABASE")
        invalidErrors = pd.DataFrame(invalidErrors,
                                     columns=["id", 'errors', 'name'])
        # Remove fixed duplicated files
        dupIds = invalidErrors['id'][
            invalidErrors['errors'] ==
            "DUPLICATED FILENAME! FILES SHOULD BE UPLOADED AS NEW VERSIONS AND THE ENTIRE DATASET SHOULD BE UPLOADED EVERYTIME"]
        removeIds = dupIds[~dupIds.isin(duplicatedFiles['id'])]
        invalidErrors = invalidErrors[~invalidErrors['id'].isin(removeIds)]
        # Append duplicated file errors
        invalidErrors = invalidErrors.append(
            duplicatedFiles[['id', 'errors', 'name']])
        invalidErrors['center'] = center
        invalidIds = inputValidStatus['id'][inputValidStatus['status'] ==
                                            "INVALID"]
        invalidErrors = invalidErrors[invalidErrors['id'].isin(invalidIds)]
        process_functions.updateDatabase(
            syn,
            errorTracker.asDataFrame(),
            invalidErrors,
            process_functions.getDatabaseSynId(
                syn,
                "errorTracker",
                databaseToSynIdMappingDf=databaseToSynIdMappingDf), ["id"],
            toDelete=True)

        paths = inputValidStatus['path']
        filenames = [os.path.basename(name) for name in paths]
        del inputValidStatus['path']
        logger.info("UPDATE VALIDATION STATUS DATABASE")
        inputValidStatus['center'] = center
        # Remove fixed duplicated files
        inputValidStatus = inputValidStatus[
            ~inputValidStatus['id'].isin(removeIds)]

        process_functions.updateDatabase(
            syn,
            validationStatus.asDataFrame(),
            inputValidStatus[[
                "id", 'md5', 'status', 'name', 'center', 'modifiedOn'
            ]],
            process_functions.getDatabaseSynId(
                syn,
                "validationStatus",
                databaseToSynIdMappingDf=databaseToSynIdMappingDf), ["id"],
            toDelete=True)
        inputValidStatus['path'] = paths
        validFiles = inputValidStatus[[
            'id', 'path', 'fileType'
        ]][inputValidStatus['status'] == "VALIDATED"]
        return validFiles
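The duplicate detection above flags repeated filenames plus any second .cbs/.seg file; the core of that selection, reduced to plain pandas on toy data:

import os
import pandas as pd

files = pd.DataFrame({"id": ["syn1", "syn2", "syn3", "syn4"],
                      "name": ["data_clinical.txt", "data_clinical.txt",
                               "a.seg", "b.seg"]})
dups = files[files["name"].duplicated(keep=False)]
seg_like = files[[os.path.basename(n).endswith((".cbs", ".seg"))
                  for n in files["name"]]]
if len(seg_like) > 1:  # only one cbs/seg file is allowed
    dups = pd.concat([dups, seg_like]).drop_duplicates("id")
print(dups["id"].tolist())  # ['syn1', 'syn2', 'syn3', 'syn4']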
Example #12
    def _validate(self, assay_info_df, project_id):
        """
        Validates the values of assay information file

        Args:
            assay_info_df: assay information dataframe

        Returns:
            tuple: error and warning
        """

        total_error = ""
        warning = ""

        if process_functions.checkColExist(assay_info_df, "SEQ_ASSAY_ID"):
            all_seq_assays = (assay_info_df.SEQ_ASSAY_ID.replace(
                {
                    "_": "-"
                }, regex=True).str.upper().unique())
            if not all(
                [assay.startswith(self.center) for assay in all_seq_assays]):
                total_error += (
                    "Assay_information.yaml: Please make sure all your "
                    "SEQ_ASSAY_IDs start with your center abbreviation.\n")
            db_to_syn_map_df = process_functions.get_synid_database_mappingdf(
                self.syn, project_id)
            sample_synid = process_functions.getDatabaseSynId(
                self.syn, "sample", databaseToSynIdMappingDf=db_to_syn_map_df)
            uniq_seq_df = process_functions.get_syntabledf(
                self.syn,
                f"select distinct(SEQ_ASSAY_ID) as seq from {sample_synid} "
                f"where CENTER = '{self.center}'",
            )
            # These are all the SEQ_ASSAY_IDs that are in the clinical database
            # but not in the assay_information file
            missing_seqs = uniq_seq_df["seq"][
                ~uniq_seq_df["seq"].replace({
                    "_": "-"
                }, regex=True).str.upper().isin(all_seq_assays)]
            missing_seqs_str = ", ".join(missing_seqs)
            if missing_seqs.to_list():
                total_error += (
                    "Assay_information.yaml: You are missing SEQ_ASSAY_IDs: "
                    f"{missing_seqs_str}\n")

        else:
            total_error += "Assay_information.yaml: Must have SEQ_ASSAY_ID column.\n"

        read_group_dict = process_functions.get_gdc_data_dictionary(
            "read_group")
        read_group_headers = read_group_dict["properties"]
        warn, error = process_functions.check_col_and_values(
            assay_info_df,
            "is_paired_end",
            [True, False],
            filename="Assay_information.yaml",
            required=True,
        )
        warning += warn
        total_error += error

        warn, error = process_functions.check_col_and_values(
            assay_info_df,
            "library_selection",
            read_group_headers["library_selection"]["enum"],
            filename="Assay_information.yaml",
            required=True,
        )
        warning += warn
        total_error += error

        warn, error = process_functions.check_col_and_values(
            assay_info_df,
            "library_strategy",
            read_group_headers["library_strategy"]["enum"],
            filename="Assay_information.yaml",
            required=True,
        )
        warning += warn
        total_error += error

        warn, error = process_functions.check_col_and_values(
            assay_info_df,
            "platform",
            read_group_headers["platform"]["enum"],
            filename="Assay_information.yaml",
            required=True,
        )
        warning += warn
        total_error += error

        instrument_model = read_group_headers["instrument_model"]["enum"]
        instrument_model.extend(["Illumina NovaSeq 6000", None])
        warn, error = process_functions.check_col_and_values(
            assay_info_df,
            "instrument_model",
            instrument_model,
            filename="Assay_information.yaml",
            required=True,
        )
        warning += warn
        total_error += error

        # target_capture_kit = read_group_headers['target_capture_kit']['enum']
        # warn, error = process_functions.check_col_and_values(
        #     assay_info_df,
        #     'target_capture_kit',
        #     target_capture_kit,
        #     filename="Assay_information.yaml",
        #     required=True)
        # warning += warn
        # total_error += error

        if not process_functions.checkColExist(assay_info_df,
                                               "target_capture_kit"):
            total_error += ("Assay_information.yaml: "
                            "Must have target_capture_kit column.\n")

        variant_classes = [
            "Splice_Site",
            "Nonsense_Mutation",
            "Frame_Shift_Del",
            "Frame_Shift_Ins",
            "Nonstop_Mutation",
            "Translation_Start_Site",
            "In_Frame_Ins",
            "In_Frame_Del",
            "Missense_Mutation",
            "Intron",
            "Splice_Region",
            "Silent",
            "RNA",
            "5'UTR",
            "3'UTR",
            "IGR",
            "5'Flank",
            "3'Flank",
            None,
        ]
        warn, error = process_functions.check_col_and_values(
            assay_info_df,
            "variant_classifications",
            variant_classes,
            filename="Assay_information.yaml",
            na_allowed=True,
            sep=";",
        )
        warning += warn
        total_error += error

        if process_functions.checkColExist(assay_info_df, "read_length"):
            if not all([
                    process_functions.checkInt(i)
                    for i in assay_info_df["read_length"]
                    if i is not None and not pd.isnull(i)
            ]):
                total_error += ("Assay_information.yaml: "
                                "Please double check your read_length.  "
                                "It must be an integer or null.\n")
        else:
            total_error += "Assay_information.yaml: " "Must have read_length column.\n"

        if process_functions.checkColExist(assay_info_df, "number_of_genes"):
            if not all([
                    process_functions.checkInt(i)
                    for i in assay_info_df["number_of_genes"]
            ]):
                total_error += ("Assay_information.yaml: "
                                "Please double check your number_of_genes. "
                                "It must be an integer.\n")
        else:
            total_error += ("Assay_information.yaml: "
                            "Must have number_of_genes column.\n")

        if process_functions.checkColExist(assay_info_df, "gene_padding"):
            if not all([
                    process_functions.checkInt(i)
                    for i in assay_info_df["gene_padding"]
                    if i is not None and not pd.isnull(i)
            ]):
                total_error += ("Assay_information.yaml: "
                                "Please double check your gene_padding. "
                                "It must be an integer or blank.\n")
        else:
            warning += ("Assay_information.yaml: "
                        "gene_padding is by default 10 if not specified.\n")

        warn, error = process_functions.check_col_and_values(
            assay_info_df,
            "calling_strategy",
            ["tumor_only", "tumor_normal", "plasma_normal"],
            filename="Assay_information.yaml",
            required=True,
        )
        warning += warn
        total_error += error

        if process_functions.checkColExist(assay_info_df,
                                           "specimen_tumor_cellularity"):
            if not all([
                    i.startswith(">") and i.endswith("%")
                    for i in assay_info_df["specimen_tumor_cellularity"]
            ]):
                total_error += (
                    "Assay_information.yaml: "
                    "Please double check your specimen_tumor_cellularity. "
                    "It must be in this format: >(num)%, e.g. >10%\n")
        else:
            total_error += ("Assay_information.yaml: "
                            "Must have specimen_tumor_cellularity column.\n")

        alteration_types = [
            "snv",
            "small_indels",
            "gene_level_cna",
            "intragenic_cna",
            "structural_variants",
        ]
        warn, error = process_functions.check_col_and_values(
            assay_info_df,
            "alteration_types",
            alteration_types,
            filename="Assay_information.yaml",
            required=True,
            sep=";",
        )
        warning += warn
        total_error += error

        preservation_technique = ["FFPE", "fresh_frozen", "NA"]
        warn, error = process_functions.check_col_and_values(
            assay_info_df,
            "preservation_technique",
            preservation_technique,
            filename="Assay_information.yaml",
            required=True,
            sep=";",
        )
        warning += warn
        total_error += error

        coverage = ["hotspot_regions", "coding_exons", "introns", "promoters"]
        warn, error = process_functions.check_col_and_values(
            assay_info_df,
            "coverage",
            coverage,
            filename="Assay_information.yaml",
            required=True,
            sep=";",
        )
        warning += warn
        total_error += error

        return total_error, warning
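Every block above follows the same pattern: look up the allowed enum, call process_functions.check_col_and_values, and accumulate the warning/error strings. An illustrative standalone version of such an enumerated-column check (a sketch of the pattern only; the genie implementation may differ):

import pandas as pd

def check_enum_column(df, col, allowed, filename, required=False):
    """Return (warning, error) strings for an enumerated column."""
    warning, error = "", ""
    if col not in df.columns:
        if required:
            error = "%s: Must have %s column.\n" % (filename, col)
        else:
            warning = "%s: Doesn't have %s column.\n" % (filename, col)
        return warning, error
    if not df[col].dropna().isin(allowed).all():
        error = ("%s: Please double check your %s column.  "
                 "Valid values are: %s.\n" %
                 (filename, col, ", ".join(map(str, allowed))))
    return warning, error

warn, err = check_enum_column(
    pd.DataFrame({"preservation_technique": ["FFPE", "unknown"]}),
    "preservation_technique", ["FFPE", "fresh_frozen", "NA"],
    filename="Assay_information.yaml", required=True)
print(err)  # flags 'unknown' as an invalid value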
Example #13
    def _validate(self, cnvDF, nosymbol_check, project_id):
        total_error = ""
        warning = ""
        cnvDF.columns = [col.upper() for col in cnvDF.columns]

        if cnvDF.columns[0] != "HUGO_SYMBOL":
            total_error += "Your cnv file's first column must be Hugo_Symbol\n"
        haveColumn = process_functions.checkColExist(cnvDF, "HUGO_SYMBOL")
        if haveColumn:
            keepSymbols = cnvDF["HUGO_SYMBOL"]
            cnvDF.drop("HUGO_SYMBOL", axis=1, inplace=True)

        # if sum(cnvDF.apply(lambda x: sum(x.isnull()))) > 0:
        #   total_error += "Your cnv file must not have any empty values\n"

        if process_functions.checkColExist(cnvDF, "ENTREZ_GENE_ID"):
            del cnvDF["ENTREZ_GENE_ID"]

        # cnvDF = cnvDF.fillna('')
        allowed_values = [
            "-2.0",
            "-2",
            "-1.5",
            "-1.0",
            "-1",
            "-0.5",
            "0.0",
            "0",
            "0.5",
            "1.0",
            "1",
            "1.5",
            "2",
            "2.0",
            "nan",
        ]
        if not all(cnvDF.applymap(lambda x: str(x) in allowed_values).all()):
            total_error += ("All values must be NA/blank, -2, -1.5, -1, -0.5, "
                            "0, 0.5, 1, 1.5, or 2.\n")
        else:
            cnvDF["HUGO_SYMBOL"] = keepSymbols
            if haveColumn and not nosymbol_check:
                databaseToSynIdMappingDf = (
                    process_functions.get_synid_database_mappingdf(
                        self.syn, project_id))
                bedSynId = process_functions.getDatabaseSynId(
                    self.syn,
                    "bed",
                    databaseToSynIdMappingDf=databaseToSynIdMappingDf)
                bed = self.syn.tableQuery(
                    "select Hugo_Symbol, ID from {} where "
                    "CENTER = '{}'".format(bedSynId, self.center))
                bedDf = bed.asDataFrame()
                cnvDF["remapped"] = cnvDF["HUGO_SYMBOL"].apply(
                    lambda x: validateSymbol(x, bedDf))
                cnvDF = cnvDF[~cnvDF["remapped"].isnull()]

                # Do not allow any duplicated genes after symbols
                # have been remapped
                if sum(cnvDF["remapped"].duplicated()) > 0:
                    duplicated = cnvDF["remapped"].duplicated(keep=False)
                    total_error += (
                        "Your CNA file has duplicated Hugo_Symbols "
                        "(After remapping of genes): {} -> {}.\n".format(
                            ",".join(cnvDF["HUGO_SYMBOL"][duplicated]),
                            ",".join(cnvDF["remapped"][duplicated]),
                        ))
        return (total_error, warning)
Example #14
def main(
    process,
    project_id,
    center=None,
    pemfile=None,
    delete_old=False,
    only_validate=False,
    oncotree_link=None,
    genie_annotation_pkg=None,
    create_new_maf_database=False,
    debug=False,
    format_registry=None,
):

    syn = process_functions.synLogin(pemfile, debug=debug)

    # Get the Synapse Project where data is stored
    # Should have annotations to find the table lookup
    project = syn.get(project_id)
    database_to_synid_mapping_synid = project.annotations.get("dbMapping", "")

    databaseToSynIdMapping = syn.tableQuery(
        "SELECT * FROM {}".format(database_to_synid_mapping_synid[0])
    )
    databaseToSynIdMappingDf = databaseToSynIdMapping.asDataFrame()

    center_mapping_id = process_functions.getDatabaseSynId(
        syn, "centerMapping", databaseToSynIdMappingDf=databaseToSynIdMappingDf
    )

    center_mapping = syn.tableQuery("SELECT * FROM %s" % center_mapping_id)
    center_mapping_df = center_mapping.asDataFrame()

    if center is not None:
        assert (
            center in center_mapping_df.center.tolist()
        ), "Must specify one of these centers: {}".format(
            ", ".join(center_mapping_df.center)
        )
        centers = [center]
    else:
        # exclude_sites = ['JHU', 'DFCI', 'GRCC', 'VICC', 'NKI', 'MSK',
        #                  'UHN', 'MDA', 'WAKE', 'YALE', 'UCSF', 'CRUK',
        #                  'CHOP', 'VHIO', 'SCI', 'PHS', 'COLU', 'UCHI']
        center_mapping_df = center_mapping_df[~center_mapping_df["inputSynId"].isnull()]
        # release is a bool column
        center_mapping_df = center_mapping_df[center_mapping_df["release"]]
        # center_mapping_df = center_mapping_df[
        #     ~center_mapping_df['center'].isin(exclude_sites)
        # ]
        centers = center_mapping_df.center

    if oncotree_link is None:
        onco_link = databaseToSynIdMappingDf["Id"][
            databaseToSynIdMappingDf["Database"] == "oncotreeLink"
        ].values[0]
        onco_link_ent = syn.get(onco_link)
        oncotree_link = onco_link_ent.externalURL
    # Check if you can connect to oncotree link,
    # if not then don't run validation / processing
    process_functions.checkUrl(oncotree_link)

    center_mapping_ent = syn.get(center_mapping_id)
    if center_mapping_ent.get("isProcessing", ["True"])[0] == "True":
        raise Exception(
            "Processing/validation is currently happening.  "
            "Please change/add the 'isProcessing' annotation on {} "
            "to False to enable processing".format(center_mapping_id)
        )
    else:
        center_mapping_ent.isProcessing = "True"
        center_mapping_ent = syn.store(center_mapping_ent)
    # remove this query timeout and see what happens
    # syn.table_query_timeout = 50000

    # Create new maf database, should only happen once if its specified
    if create_new_maf_database:
        today = date.today()
        table_name = f"Narrow MAF Database - {today}"
        # filetype = "vcf2maf"
        # syn7208886 is the GENIE staging project to archive maf table
        new_tables = process_functions.create_new_fileformat_table(
            syn, "vcf2maf", table_name, project_id, "syn7208886"
        )
        syn.setPermissions(new_tables["newdb_ent"].id, 3326313, [])
        databaseToSynIdMappingDf = new_tables["newdb_mappingdf"]

    format_registry = config.collect_format_types(format_registry)
    for process_center in centers:
        input_to_database.center_input_to_database(
            syn,
            project_id,
            process_center,
            process,
            only_validate,
            databaseToSynIdMappingDf,
            center_mapping_df,
            delete_old=delete_old,
            oncotree_link=oncotree_link,
            format_registry=format_registry,
            genie_annotation_pkg=genie_annotation_pkg,
        )

    # To ensure that this is the new entity
    center_mapping_ent = syn.get(center_mapping_id)
    center_mapping_ent.isProcessing = "False"
    center_mapping_ent = syn.store(center_mapping_ent)

    error_tracker_synid = process_functions.getDatabaseSynId(
        syn, "errorTracker", databaseToSynIdMappingDf=databaseToSynIdMappingDf
    )
    # Only write out invalid reasons if the center
    # isnt specified and if only validate
    if center is None and only_validate:
        logger.info("WRITING INVALID REASONS TO CENTER STAGING DIRS")
        write_invalid_reasons.write(syn, center_mapping_df, error_tracker_synid)
    logger.info("INPUT TO DATABASE COMPLETE")
Example #15
def main(process, project_config=None, center=None, pemfile=None,
         delete_old=False, only_validate=False, oncotree_link=None,
         create_new_maf_database=False, testing=False, debug=False,
         reference=None, vcf2maf_path=None, vep_path=None,
         vep_data=None, thread=1, format_registry=config.PROCESS_FILES):

    syn = process_functions.synLogin(pemfile, debug=debug)

    try:
        # Must specify correct paths to vcf2maf, VEP and VEP data
        # if trying to process vcf, maf and mafSP
        if process in ['vcf', 'maf', 'mafSP'] and not only_validate:
            assert os.path.exists(vcf2maf_path), (
                "Path to vcf2maf (--vcf2mafPath) must be specified "
                "if `--process {vcf,maf,mafSP}` is used")
            assert os.path.exists(vep_path), (
                "Path to VEP (--vepPath) must be specified "
                "if `--process {vcf,maf,mafSP}` is used")
            assert os.path.exists(vep_data), (
                "Path to VEP data (--vepData) must be specified "
                "if `--process {vcf,maf,mafSP}` is used")

        databaseToSynIdMapping = syn.tableQuery(
            'SELECT * FROM {}'.format(
                project_config.get('database_to_synid_mapping')))
        databaseToSynIdMappingDf = databaseToSynIdMapping.asDataFrame()

        center_mapping_id = process_functions.getDatabaseSynId(
            syn, "centerMapping",
            databaseToSynIdMappingDf=databaseToSynIdMappingDf)

        center_mapping = syn.tableQuery('SELECT * FROM %s' % center_mapping_id)
        center_mapping_df = center_mapping.asDataFrame()

        if center is not None:
            assert center in center_mapping_df.center.tolist(), (
                "Must specify one of these centers: {}".format(
                    ", ".join(center_mapping_df.center)))
            centers = [center]
        else:
            center_mapping_df = \
                center_mapping_df[~center_mapping_df['inputSynId'].isnull()]
            # release is a bool column
            center_mapping_df = center_mapping_df[center_mapping_df['release']]
            centers = center_mapping_df.center

        if oncotree_link is None:
            onco_link = databaseToSynIdMappingDf['Id'][
                databaseToSynIdMappingDf['Database'] == 'oncotreeLink'].values[0]
            onco_link_ent = syn.get(onco_link)
            oncotree_link = onco_link_ent.externalURL
        # Check if you can connect to oncotree link,
        # if not then don't run validation / processing
        process_functions.checkUrl(oncotree_link)

        currently_processing = get_processing_status(syn, center_mapping_id)
        
        if currently_processing:
            logger.error(
                "Processing/validation is currently happening.  "
                "Please change/add the 'isProcessing' annotation on {} "
                "to False to enable processing".format(center_mapping_id))
            sys.exit(1)
        else:
            status = set_processing_status(syn, center_mapping_id, status=True)
        # remove this query timeout and see what happens
        # syn.table_query_timeout = 50000

        # Create new maf database, should only happen once if its specified
        if create_new_maf_database:
            databaseToSynIdMappingDf = \
                input_to_database.create_and_archive_maf_database(syn, databaseToSynIdMappingDf)

        format_registry = config.collect_format_types(format_registry)
        logger.debug("Using {format_registry} file formats.".format(
            format_registry=format_registry))

        for center in centers:
            input_to_database.center_input_to_database(
                syn, center, process,
                testing, only_validate,
                vcf2maf_path, vep_path,
                vep_data, databaseToSynIdMappingDf,
                center_mapping_df, reference=reference,
                delete_old=delete_old,
                oncotree_link=oncotree_link,
                thread=thread, format_registry=format_registry)

        # To ensure that this is the new entity
        center_mapping_ent = syn.get(center_mapping_id)
        center_mapping_ent.isProcessing = "False"
        center_mapping_ent = syn.store(center_mapping_ent)

        error_tracker_synid = process_functions.getDatabaseSynId(
            syn, "errorTracker", databaseToSynIdMappingDf=databaseToSynIdMappingDf)
        # Only write out invalid reasons if the center
        # isnt specified and if only validate
        if center is None and only_validate:
            logger.info("WRITING INVALID REASONS TO CENTER STAGING DIRS")
            write_invalid_reasons.write_invalid_reasons(
                syn, center_mapping_df, error_tracker_synid)
    except Exception:
        raise
    finally:
        _ = set_processing_status(syn, center_mapping_id, status=False)
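get_processing_status and set_processing_status are not shown in this example; a plausible sketch of both, based on the isProcessing annotation pattern used in Examples #9 and #14 (an assumption, not the project's exact helpers):

def get_processing_status(syn, center_mapping_id):
    """Return True if the 'isProcessing' annotation is 'True'."""
    center_mapping_ent = syn.get(center_mapping_id)
    return center_mapping_ent.get("isProcessing", ["True"])[0] == "True"

def set_processing_status(syn, center_mapping_id, status=True):
    """Flip the 'isProcessing' annotation and store the entity."""
    center_mapping_ent = syn.get(center_mapping_id)
    center_mapping_ent.isProcessing = "True" if status else "False"
    return syn.store(center_mapping_ent)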