def import_study_data(jvm_args, meta_filename, data_filename): args = jvm_args.split(' ') meta_file_dict, meta_file_type = cbioportal_common.parse_metadata_file( meta_filename, logger=LOGGER) if meta_file_type is None: # invalid file, skip return if not data_filename.endswith(meta_file_dict['data_filename']): print >> ERROR_FILE, ("'data_filename' in meta file contradicts " "data filename in command, skipping file") return importer = IMPORTER_CLASSNAME_BY_META_TYPE[meta_file_type] args.append(importer) if IMPORTER_REQUIRES_METADATA[importer]: args.append("--meta") args.append(meta_filename) args.append("--loadMode") args.append("bulkload") if importer in ("org.mskcc.cbio.portal.scripts.ImportMutSigData", "org.mskcc.cbio.portal.scripts.ImportGisticData"): args.append("--data") args.append(data_filename) args.append("--study") args.append(meta_file_dict['cancer_study_identifier']) else: args.append("--data") args.append(data_filename) args.append("--noprogress") # don't report memory usage and % progress run_java(*args)
def remove_study(jvm_args, meta_filename): args = jvm_args.split(" ") args.append(REMOVE_STUDY_CLASS) meta_dict, meta_type = cbioportal_common.parse_metadata_file(meta_filename, logger=LOGGER) if meta_type != MetaFileTypes.STUDY: # invalid file, skip print >> ERROR_FILE, "Not a study meta file: " + meta_filename return args.append(meta_dict["cancer_study_identifier"]) args.append("--noprogress") # don't report memory usage and % progress run_java(*args)
def remove_study(jvm_args, meta_filename): args = jvm_args.split(' ') args.append(REMOVE_STUDY_CLASS) meta_dictionary = cbioportal_common.parse_metadata_file(meta_filename, logger=LOGGER) if meta_dictionary['meta_file_type'] != MetaFileTypes.STUDY: # invalid file, skip print >> ERROR_FILE, 'Not a study meta file: ' + meta_filename return args.append(meta_dictionary['cancer_study_identifier']) args.append("--noprogress") # don't report memory usage and % progress run_java(*args)
def import_study_data(jvm_args, meta_filename, data_filename, meta_file_dictionary=None): args = jvm_args.split(' ') # In case the meta file is already parsed in a previous function, it is not # necessary to parse it again if meta_file_dictionary is None: meta_file_dictionary = cbioportal_common.parse_metadata_file( meta_filename, logger=LOGGER) # Retrieve meta file type meta_file_type = meta_file_dictionary['meta_file_type'] # invalid file, skip if meta_file_type is None: print >> ERROR_FILE, ( "Unrecognized meta file type '%s', skipping file" % (meta_file_type)) return if not data_filename.endswith(meta_file_dictionary['data_filename']): print >> ERROR_FILE, ("'data_filename' in meta file contradicts " "data filename in command, skipping file") return importer = IMPORTER_CLASSNAME_BY_META_TYPE[meta_file_type] args.append(importer) if IMPORTER_REQUIRES_METADATA[importer]: args.append("--meta") args.append(meta_filename) args.append("--loadMode") args.append("bulkload") if importer in ("org.mskcc.cbio.portal.scripts.ImportMutSigData", "org.mskcc.cbio.portal.scripts.ImportGisticData"): args.append("--data") args.append(data_filename) args.append("--study") args.append(meta_file_dictionary['cancer_study_identifier']) elif importer == "org.mskcc.cbio.portal.scripts.ImportGenePanelProfileMap": args.append("--meta") args.append(meta_filename) args.append("--data") args.append(data_filename) else: args.append("--data") args.append(data_filename) args.append("--noprogress") # don't report memory usage and % progress run_java(*args)
def import_study_data(jvm_args, meta_filename, data_filename, meta_file_dictionary = None): args = jvm_args.split(' ') # In case the meta file is already parsed in a previous function, it is not # necessary to parse it again if meta_file_dictionary is None: meta_file_dictionary = cbioportal_common.parse_metadata_file( meta_filename, logger=LOGGER) # Retrieve meta file type meta_file_type = meta_file_dictionary['meta_file_type'] # invalid file, skip if meta_file_type is None: print >> ERROR_FILE, ("Unrecognized meta file type '%s', skipping file" % (meta_file_type)) return if not data_filename.endswith(meta_file_dictionary['data_filename']): print >> ERROR_FILE, ("'data_filename' in meta file contradicts " "data filename in command, skipping file") return importer = IMPORTER_CLASSNAME_BY_META_TYPE[meta_file_type] args.append(importer) if IMPORTER_REQUIRES_METADATA[importer]: args.append("--meta") args.append(meta_filename) args.append("--loadMode") args.append("bulkload") if importer in ("org.mskcc.cbio.portal.scripts.ImportMutSigData", "org.mskcc.cbio.portal.scripts.ImportGisticData"): args.append("--data") args.append(data_filename) args.append("--study") args.append(meta_file_dictionary['cancer_study_identifier']) elif importer == "org.mskcc.cbio.portal.scripts.ImportGenePanelProfileMap": args.append("--meta") args.append(meta_filename) args.append("--data") args.append(data_filename) else: args.append("--data") args.append(data_filename) args.append("--noprogress") # don't report memory usage and % progress run_java(*args)
def process_directory(jvm_args, study_directory): """Import an entire study directory based on meta files found.""" meta_filenames = ( os.path.join(study_directory, f) for f in os.listdir(study_directory) if re.search(r'(\b|_)meta(\b|[_0-9])', f, flags=re.IGNORECASE) and not (f.startswith('.') or f.endswith('~'))) study_id = None study_metafile = None study_metadata = None cancer_type_filepairs = [] sample_attr_filepair = None regular_filepairs = [] # read all meta files (excluding case lists) to determine what to import for f in meta_filenames: # parse meta file metadata, meta_file_type = cbioportal_common.parse_metadata_file( f, study_id=study_id, logger=LOGGER) if meta_file_type is None: # invalid meta file, let's die raise RuntimeError('Invalid meta file: ' + f) # remember study id to give an error in case any other file is referencing a different one if study_id is None and 'cancer_study_identifier' in metadata: study_id = metadata['cancer_study_identifier'] if meta_file_type == MetaFileTypes.CANCER_TYPE: cancer_type_filepairs.append( (f, os.path.join(study_directory, metadata['data_filename']))) elif meta_file_type == MetaFileTypes.STUDY: if study_metafile is not None: raise RuntimeError( 'Multiple meta_study files found: {} and {}'.format( study_metafile, f)) study_metafile = f study_metadata = metadata elif meta_file_type == MetaFileTypes.SAMPLE_ATTRIBUTES: if sample_attr_filepair is not None: raise RuntimeError( 'Multiple sample attribute files found: {} and {}'.format( sample_attr_filepair[0], f)) sample_attr_filepair = ( f, os.path.join(study_directory, metadata['data_filename'])) else: regular_filepairs.append( (f, os.path.join(study_directory, metadata['data_filename']))) # First, import cancer types for meta_filename, data_filename in cancer_type_filepairs: import_cancer_type(jvm_args, data_filename) # Then define the study if study_metafile is None: raise RuntimeError('No meta_study file found') else: # First remove study if exists remove_study(jvm_args, study_metafile) import_study(jvm_args, study_metafile) # Next, we need to import sample definitions if sample_attr_filepair is None: raise RuntimeError('No sample attribute file found') else: meta_filename, data_filename = sample_attr_filepair import_study_data(jvm_args, meta_filename, data_filename) # Now, import everything else for meta_filename, data_filename in regular_filepairs: import_study_data(jvm_args, meta_filename, data_filename) # do the case lists case_list_dirname = os.path.join(study_directory, 'case_lists') if os.path.isdir(case_list_dirname): process_case_lists(jvm_args, case_list_dirname) if study_metadata.get('add_global_case_list', 'false').lower() == 'true': add_global_case_list(jvm_args, study_id) # enable study update_study_status(jvm_args, study_id)
def process_directory(jvm_args, study_directory): """ Import an entire study directory based on meta files found. 1. Determine meta files in study directory. 2. Read all meta files and determine file types. 3. Import data files in specific order by file type. """ study_id = None study_meta_filename = None study_meta_dictionary = {} cancer_type_filepairs = [] sample_attr_filepair = None regular_filepairs = [] gene_panel_matrix_filepair = None zscore_filepairs = [] gsva_score_filepair = None gsva_pvalue_filepair = None fusion_filepair = None # Determine meta filenames in study directory meta_filenames = ( os.path.join(study_directory, meta_filename) for meta_filename in os.listdir(study_directory) if re.search(r'(\b|_)meta(\b|[_0-9])', meta_filename, flags=re.IGNORECASE) and not (meta_filename.startswith('.') or meta_filename.endswith('~'))) # Read all meta files (excluding case lists) to determine what to import for meta_filename in meta_filenames: # Parse meta file meta_dictionary = cbioportal_common.parse_metadata_file( meta_filename, study_id=study_id, logger=LOGGER) # Save meta dictionary in study meta dictionary study_meta_dictionary[meta_filename] = meta_dictionary # Retrieve meta file type meta_file_type = meta_dictionary['meta_file_type'] if meta_file_type is None: # invalid meta file, let's die raise RuntimeError('Invalid meta file: ' + meta_filename) # remember study id to give an error in case any other file is referencing a different one if study_id is None and 'cancer_study_identifier' in meta_dictionary: study_id = meta_dictionary['cancer_study_identifier'] # Check the type of metafile. It is to know which metafile types the # study contains because at a later stage we want to import in a # specific order. # Check for cancer type file if meta_file_type == MetaFileTypes.CANCER_TYPE: cancer_type_filepairs.append( (meta_filename, os.path.join(study_directory, meta_dictionary['data_filename']))) # Check for meta study file elif meta_file_type == MetaFileTypes.STUDY: if study_meta_filename is not None: raise RuntimeError( 'Multiple meta_study files found: {} and {}'.format( study_meta_filename, meta_filename)) # Determine the study meta filename study_meta_filename = meta_filename study_meta_dictionary[study_meta_filename] = meta_dictionary # Check for sample attributes elif meta_file_type == MetaFileTypes.SAMPLE_ATTRIBUTES: if sample_attr_filepair is not None: raise RuntimeError( 'Multiple sample attribute files found: {} and {}'.format( sample_attr_filepair[0], meta_filename)) # pylint: disable=unsubscriptable-object sample_attr_filepair = (meta_filename, os.path.join( study_directory, meta_dictionary['data_filename'])) # Check for gene panel matrix elif meta_file_type == MetaFileTypes.GENE_PANEL_MATRIX: gene_panel_matrix_filepair = (( meta_filename, os.path.join(study_directory, meta_dictionary['data_filename']))) # Check for z-score exression files elif meta_file_type == MetaFileTypes.EXPRESSION and meta_dictionary[ 'datatype'] == "Z-SCORE": zscore_filepairs.append( (meta_filename, os.path.join(study_directory, meta_dictionary['data_filename']))) # Check for GSVA scores elif meta_file_type == MetaFileTypes.GSVA_SCORES: gsva_score_filepair = ((meta_filename, os.path.join( study_directory, meta_dictionary['data_filename']))) # Check for GSVA p-values elif meta_file_type == MetaFileTypes.GSVA_PVALUES: gsva_pvalue_filepair = ((meta_filename, os.path.join( study_directory, meta_dictionary['data_filename']))) # Check for fusion data elif meta_file_type == MetaFileTypes.FUSION: fusion_filepair = ((meta_filename, os.path.join( study_directory, meta_dictionary['data_filename']))) # Add all other types of data else: regular_filepairs.append( (meta_filename, os.path.join(study_directory, meta_dictionary['data_filename']))) # First, import cancer types for meta_filename, data_filename in cancer_type_filepairs: import_cancer_type(jvm_args, data_filename) # Then define the study if study_meta_filename is None: raise RuntimeError('No meta_study file found') else: # First remove study if exists remove_study(jvm_args, study_meta_filename) import_study(jvm_args, study_meta_filename) # Next, we need to import sample definitions if sample_attr_filepair is None: raise RuntimeError('No sample attribute file found') else: meta_filename, data_filename = sample_attr_filepair import_study_data(jvm_args, meta_filename, data_filename, study_meta_dictionary[meta_filename]) # Next, import everything else except gene panel, fusion data, GSVA and # z-score expression. If in the future more types refer to each other, (like # in a tree structure) this could be programmed in a recursive fashion. for meta_filename, data_filename in regular_filepairs: import_study_data(jvm_args, meta_filename, data_filename, study_meta_dictionary[meta_filename]) # Import fusion data (after mutation) if fusion_filepair is not None: meta_filename, data_filename = fusion_filepair import_study_data(jvm_args, meta_filename, data_filename, study_meta_dictionary[meta_filename]) # Import expression z-score (after expression) for meta_filename, data_filename in zscore_filepairs: import_study_data(jvm_args, meta_filename, data_filename, study_meta_dictionary[meta_filename]) # Import GSVA genetic profiles (after expression and z-scores) if gsva_score_filepair is not None: # First import the GSVA score data meta_filename, data_filename = gsva_score_filepair import_study_data(jvm_args, meta_filename, data_filename, study_meta_dictionary[meta_filename]) # Second import the GSVA p-value data meta_filename, data_filename = gsva_pvalue_filepair import_study_data(jvm_args, meta_filename, data_filename, study_meta_dictionary[meta_filename]) if gene_panel_matrix_filepair is not None: meta_filename, data_filename = gene_panel_matrix_filepair import_study_data(jvm_args, meta_filename, data_filename, study_meta_dictionary[meta_filename]) # Import the case lists case_list_dirname = os.path.join(study_directory, 'case_lists') if os.path.isdir(case_list_dirname): process_case_lists(jvm_args, case_list_dirname) if study_meta_dictionary[study_meta_filename].get( 'add_global_case_list', 'false').lower() == 'true': add_global_case_list(jvm_args, study_id) # enable study update_study_status(jvm_args, study_id)
def process_directory(jvm_args, study_directory): """Import an entire study directory based on meta files found.""" meta_filenames = ( os.path.join(study_directory, f) for f in os.listdir(study_directory) if re.search(r'(\b|_)meta(\b|[_0-9])', f, flags=re.IGNORECASE) and not (f.startswith('.') or f.endswith('~'))) study_id = None study_metafile = None study_metadata = None cancer_type_filepairs = [] sample_attr_filepair = None regular_filepairs = [] gene_panel_matrix_filepair = None zscore_filepairs = [] gsva_score_filepair = None gsva_pvalue_filepair = None # read all meta files (excluding case lists) to determine what to import for f in meta_filenames: # parse meta file metadata, meta_file_type = cbioportal_common.parse_metadata_file( f, study_id=study_id, logger=LOGGER) if meta_file_type is None: # invalid meta file, let's die raise RuntimeError('Invalid meta file: ' + f) # remember study id to give an error in case any other file is referencing a different one if study_id is None and 'cancer_study_identifier' in metadata: study_id = metadata['cancer_study_identifier'] if meta_file_type == MetaFileTypes.CANCER_TYPE: cancer_type_filepairs.append( (f, os.path.join(study_directory, metadata['data_filename']))) elif meta_file_type == MetaFileTypes.STUDY: if study_metafile is not None: raise RuntimeError( 'Multiple meta_study files found: {} and {}'.format( study_metafile, f)) study_metafile = f study_metadata = metadata elif meta_file_type == MetaFileTypes.SAMPLE_ATTRIBUTES: if sample_attr_filepair is not None: raise RuntimeError( 'Multiple sample attribute files found: {} and {}'.format( sample_attr_filepair[0], f)) # pylint: disable=unsubscriptable-object sample_attr_filepair = (f, os.path.join(study_directory, metadata['data_filename'])) elif meta_file_type == MetaFileTypes.GENE_PANEL_MATRIX: gene_panel_matrix_filepair = ((f, os.path.join( study_directory, metadata['data_filename']))) elif meta_file_type == MetaFileTypes.EXPRESSION and metadata[ 'datatype'] == "Z-SCORE": zscore_filepairs.append( (f, os.path.join(study_directory, metadata['data_filename']))) elif meta_file_type == MetaFileTypes.GSVA_SCORES: gsva_score_filepair = ((f, os.path.join(study_directory, metadata['data_filename']))) elif meta_file_type == MetaFileTypes.GSVA_PVALUES: gsva_pvalue_filepair = ((f, os.path.join(study_directory, metadata['data_filename']))) else: regular_filepairs.append( (f, os.path.join(study_directory, metadata['data_filename']))) # First, import cancer types for meta_filename, data_filename in cancer_type_filepairs: import_cancer_type(jvm_args, data_filename) # Then define the study if study_metafile is None: raise RuntimeError('No meta_study file found') else: # First remove study if exists remove_study(jvm_args, study_metafile) import_study(jvm_args, study_metafile) # Next, we need to import sample definitions if sample_attr_filepair is None: raise RuntimeError('No sample attribute file found') else: meta_filename, data_filename = sample_attr_filepair import_study_data(jvm_args, meta_filename, data_filename) # Now, import everything else except gene panel, gsva & z-score expression # These data types have to be imported last, because they both refer to expression by # source_stable_id. If in the future more types refer to each other, (like # in a tree structure) this could be programmed in a recursive fashion. for meta_filename, data_filename in regular_filepairs: import_study_data(jvm_args, meta_filename, data_filename) # Now import expression z-score for meta_filename, data_filename in zscore_filepairs: import_study_data(jvm_args, meta_filename, data_filename) # Now import gsva genetic profiles if gsva_score_filepair is not None: # First import the score data meta_filename, data_filename = gsva_score_filepair import_study_data(jvm_args, meta_filename, data_filename) #Second import the pvalue data meta_filename, data_filename = gsva_pvalue_filepair import_study_data(jvm_args, gsva_pvalue_filepair[0], gsva_pvalue_filepair[1]) if gene_panel_matrix_filepair is not None: import_study_data(jvm_args, gene_panel_matrix_filepair[0], gene_panel_matrix_filepair[1]) # do the case lists case_list_dirname = os.path.join(study_directory, 'case_lists') if os.path.isdir(case_list_dirname): process_case_lists(jvm_args, case_list_dirname) if study_metadata.get('add_global_case_list', 'false').lower() == 'true': add_global_case_list(jvm_args, study_id) # enable study update_study_status(jvm_args, study_id)
def process_directory(jvm_args, study_directory): """Import an entire study directory based on meta files found.""" meta_filenames = ( os.path.join(study_directory, f) for f in os.listdir(study_directory) if re.search(r'(\b|_)meta(\b|[_0-9])', f, flags=re.IGNORECASE) and not (f.startswith('.') or f.endswith('~'))) study_id = None study_metafile = None study_metadata = None cancer_type_filepairs = [] sample_attr_filepair = None regular_filepairs = [] # read all meta files (excluding case lists) to determine what to import for f in meta_filenames: # parse meta file metadata, meta_file_type = cbioportal_common.parse_metadata_file( f, study_id=study_id, logger=LOGGER) if meta_file_type is None: # invalid meta file, let's die raise RuntimeError('Invalid meta file: ' + f) # remember study id to give an error in case any other file is referencing a different one if study_id is None and 'cancer_study_identifier' in metadata: study_id = metadata['cancer_study_identifier'] if meta_file_type == MetaFileTypes.CANCER_TYPE: cancer_type_filepairs.append( (f, os.path.join(study_directory, metadata['data_filename']))) elif meta_file_type == MetaFileTypes.STUDY: if study_metafile is not None: raise RuntimeError( 'Multiple meta_study files found: {} and {}'.format( study_metafile, f)) study_metafile = f study_metadata = metadata elif meta_file_type == MetaFileTypes.SAMPLE_ATTRIBUTES: if sample_attr_filepair is not None: raise RuntimeError( 'Multiple sample attribute files found: {} and {}'.format( sample_attr_filepair[0], f)) sample_attr_filepair = (f, os.path.join(study_directory, metadata['data_filename'])) else: regular_filepairs.append( (f, os.path.join(study_directory, metadata['data_filename']))) # First, import cancer types for meta_filename, data_filename in cancer_type_filepairs: import_cancer_type(jvm_args, data_filename) # Then define the study if study_metafile is None: raise RuntimeError('No meta_study file found') else: # First remove study if exists remove_study(jvm_args, study_metafile) import_study(jvm_args, study_metafile) # Next, we need to import sample definitions if sample_attr_filepair is None: raise RuntimeError('No sample attribute file found') else: meta_filename, data_filename = sample_attr_filepair import_study_data(jvm_args, meta_filename, data_filename) # Now, import everything else for meta_filename, data_filename in regular_filepairs: import_study_data(jvm_args, meta_filename, data_filename) # do the case lists case_list_dirname = os.path.join(study_directory, 'case_lists') if os.path.isdir(case_list_dirname): process_case_lists(jvm_args, case_list_dirname) if study_metadata.get('add_global_case_list', 'false').lower() == 'true': add_global_case_list(jvm_args, study_id) # enable study update_study_status(jvm_args, study_id)
def process_directory(jvm_args, study_directory): """Import an entire study directory based on meta files found.""" meta_filenames = ( os.path.join(study_directory, f) for f in os.listdir(study_directory) if re.search(r'(\b|_)meta(\b|[_0-9])', f, flags=re.IGNORECASE) and not (f.startswith('.') or f.endswith('~'))) study_id = None study_metafile = None study_metadata = None cancer_type_filepairs = [] sample_attr_filepair = None regular_filepairs = [] gene_panel_matrix_filepair = None zscore_filepairs = [] gsva_score_filepair = None gsva_pvalue_filepair = None # read all meta files (excluding case lists) to determine what to import for f in meta_filenames: # parse meta file metadata, meta_file_type = cbioportal_common.parse_metadata_file( f, study_id=study_id, logger=LOGGER) if meta_file_type is None: # invalid meta file, let's die raise RuntimeError('Invalid meta file: ' + f) # remember study id to give an error in case any other file is referencing a different one if study_id is None and 'cancer_study_identifier' in metadata: study_id = metadata['cancer_study_identifier'] if meta_file_type == MetaFileTypes.CANCER_TYPE: cancer_type_filepairs.append( (f, os.path.join(study_directory, metadata['data_filename']))) elif meta_file_type == MetaFileTypes.STUDY: if study_metafile is not None: raise RuntimeError( 'Multiple meta_study files found: {} and {}'.format( study_metafile, f)) study_metafile = f study_metadata = metadata elif meta_file_type == MetaFileTypes.SAMPLE_ATTRIBUTES: if sample_attr_filepair is not None: raise RuntimeError( 'Multiple sample attribute files found: {} and {}'.format( sample_attr_filepair[0], f)) # pylint: disable=unsubscriptable-object sample_attr_filepair = ( f, os.path.join(study_directory, metadata['data_filename'])) elif meta_file_type == MetaFileTypes.GENE_PANEL_MATRIX: gene_panel_matrix_filepair = ( (f, os.path.join(study_directory, metadata['data_filename']))) elif meta_file_type == MetaFileTypes.EXPRESSION and metadata['datatype'] == "Z-SCORE": zscore_filepairs.append( (f, os.path.join(study_directory, metadata['data_filename']))) elif meta_file_type == MetaFileTypes.GSVA_SCORES: gsva_score_filepair = ( (f, os.path.join(study_directory, metadata['data_filename']))) elif meta_file_type == MetaFileTypes.GSVA_PVALUES: gsva_pvalue_filepair = ( (f, os.path.join(study_directory, metadata['data_filename']))) else: regular_filepairs.append( (f, os.path.join(study_directory, metadata['data_filename']))) # First, import cancer types for meta_filename, data_filename in cancer_type_filepairs: import_cancer_type(jvm_args, data_filename) # Then define the study if study_metafile is None: raise RuntimeError('No meta_study file found') else: # First remove study if exists remove_study(jvm_args, study_metafile) import_study(jvm_args, study_metafile) # Next, we need to import sample definitions if sample_attr_filepair is None: raise RuntimeError('No sample attribute file found') else: meta_filename, data_filename = sample_attr_filepair import_study_data(jvm_args, meta_filename, data_filename) # Now, import everything else except gene panel, gsva & z-score expression # These data types have to be imported last, because they both refer to expression by # source_stable_id. If in the future more types refer to each other, (like # in a tree structure) this could be programmed in a recursive fashion. for meta_filename, data_filename in regular_filepairs: import_study_data(jvm_args, meta_filename, data_filename) # Now import expression z-score for meta_filename, data_filename in zscore_filepairs: import_study_data(jvm_args, meta_filename, data_filename) # Now import gsva genetic profiles if gsva_score_filepair is not None: # First import the score data meta_filename, data_filename = gsva_score_filepair import_study_data(jvm_args, meta_filename, data_filename) #Second import the pvalue data meta_filename, data_filename = gsva_pvalue_filepair import_study_data(jvm_args, gsva_pvalue_filepair[0], gsva_pvalue_filepair[1]) if gene_panel_matrix_filepair is not None: import_study_data(jvm_args, gene_panel_matrix_filepair[0], gene_panel_matrix_filepair[1]) # do the case lists case_list_dirname = os.path.join(study_directory, 'case_lists') if os.path.isdir(case_list_dirname): process_case_lists(jvm_args, case_list_dirname) if study_metadata.get('add_global_case_list', 'false').lower() == 'true': add_global_case_list(jvm_args, study_id) # enable study update_study_status(jvm_args, study_id)
def process_directory(jvm_args, study_directory): """ Import an entire study directory based on meta files found. 1. Determine meta files in study directory. 2. Read all meta files and determine file types. 3. Import data files in specific order by file type. """ study_id = None study_meta_filename = None study_meta_dictionary = {} cancer_type_filepairs = [] sample_attr_filepair = None regular_filepairs = [] gene_panel_matrix_filepair = None zscore_filepairs = [] gsva_score_filepair = None gsva_pvalue_filepair = None fusion_filepair = None # Determine meta filenames in study directory meta_filenames = ( os.path.join(study_directory, meta_filename) for meta_filename in os.listdir(study_directory) if re.search(r'(\b|_)meta(\b|[_0-9])', meta_filename, flags=re.IGNORECASE) and not (meta_filename.startswith('.') or meta_filename.endswith('~'))) # Read all meta files (excluding case lists) to determine what to import for meta_filename in meta_filenames: # Parse meta file meta_dictionary = cbioportal_common.parse_metadata_file( meta_filename, study_id=study_id, logger=LOGGER) # Save meta dictionary in study meta dictionary study_meta_dictionary[meta_filename] = meta_dictionary # Retrieve meta file type meta_file_type = meta_dictionary['meta_file_type'] if meta_file_type is None: # invalid meta file, let's die raise RuntimeError('Invalid meta file: ' + meta_filename) # remember study id to give an error in case any other file is referencing a different one if study_id is None and 'cancer_study_identifier' in meta_dictionary: study_id = meta_dictionary['cancer_study_identifier'] # Check the type of metafile. It is to know which metafile types the # study contains because at a later stage we want to import in a # specific order. # Check for cancer type file if meta_file_type == MetaFileTypes.CANCER_TYPE: cancer_type_filepairs.append( (meta_filename, os.path.join(study_directory, meta_dictionary['data_filename']))) # Check for meta study file elif meta_file_type == MetaFileTypes.STUDY: if study_meta_filename is not None: raise RuntimeError( 'Multiple meta_study files found: {} and {}'.format( study_meta_filename, meta_filename)) # Determine the study meta filename study_meta_filename = meta_filename study_meta_dictionary[study_meta_filename] = meta_dictionary # Check for sample attributes elif meta_file_type == MetaFileTypes.SAMPLE_ATTRIBUTES: if sample_attr_filepair is not None: raise RuntimeError( 'Multiple sample attribute files found: {} and {}'.format( sample_attr_filepair[0], meta_filename)) # pylint: disable=unsubscriptable-object sample_attr_filepair = ( meta_filename, os.path.join(study_directory, meta_dictionary['data_filename'])) # Check for gene panel matrix elif meta_file_type == MetaFileTypes.GENE_PANEL_MATRIX: gene_panel_matrix_filepair = ( (meta_filename, os.path.join(study_directory, meta_dictionary['data_filename']))) # Check for z-score exression files elif meta_file_type == MetaFileTypes.EXPRESSION and meta_dictionary['datatype'] == "Z-SCORE": zscore_filepairs.append( (meta_filename, os.path.join(study_directory, meta_dictionary['data_filename']))) # Check for GSVA scores elif meta_file_type == MetaFileTypes.GSVA_SCORES: gsva_score_filepair = ( (meta_filename, os.path.join(study_directory, meta_dictionary['data_filename']))) # Check for GSVA p-values elif meta_file_type == MetaFileTypes.GSVA_PVALUES: gsva_pvalue_filepair = ( (meta_filename, os.path.join(study_directory, meta_dictionary['data_filename']))) # Check for fusion data elif meta_file_type == MetaFileTypes.FUSION: fusion_filepair = ( (meta_filename, os.path.join(study_directory, meta_dictionary['data_filename']))) # Add all other types of data else: regular_filepairs.append( (meta_filename, os.path.join(study_directory, meta_dictionary['data_filename']))) # First, import cancer types for meta_filename, data_filename in cancer_type_filepairs: import_cancer_type(jvm_args, data_filename) # Then define the study if study_meta_filename is None: raise RuntimeError('No meta_study file found') else: # First remove study if exists remove_study(jvm_args, study_meta_filename) import_study(jvm_args, study_meta_filename) # Next, we need to import sample definitions if sample_attr_filepair is None: raise RuntimeError('No sample attribute file found') else: meta_filename, data_filename = sample_attr_filepair import_study_data(jvm_args, meta_filename, data_filename, study_meta_dictionary[meta_filename]) # Next, import everything else except gene panel, fusion data, GSVA and # z-score expression. If in the future more types refer to each other, (like # in a tree structure) this could be programmed in a recursive fashion. for meta_filename, data_filename in regular_filepairs: import_study_data(jvm_args, meta_filename, data_filename, study_meta_dictionary[meta_filename]) # Import fusion data (after mutation) if fusion_filepair is not None: meta_filename, data_filename = fusion_filepair import_study_data(jvm_args, meta_filename, data_filename, study_meta_dictionary[meta_filename]) # Import expression z-score (after expression) for meta_filename, data_filename in zscore_filepairs: import_study_data(jvm_args, meta_filename, data_filename, study_meta_dictionary[meta_filename]) # Import GSVA genetic profiles (after expression and z-scores) if gsva_score_filepair is not None: # First import the GSVA score data meta_filename, data_filename = gsva_score_filepair import_study_data(jvm_args, meta_filename, data_filename, study_meta_dictionary[meta_filename]) # Second import the GSVA p-value data meta_filename, data_filename = gsva_pvalue_filepair import_study_data(jvm_args, meta_filename, data_filename, study_meta_dictionary[meta_filename]) if gene_panel_matrix_filepair is not None: meta_filename, data_filename = gene_panel_matrix_filepair import_study_data(jvm_args, meta_filename, data_filename, study_meta_dictionary[meta_filename]) # Import the case lists case_list_dirname = os.path.join(study_directory, 'case_lists') if os.path.isdir(case_list_dirname): process_case_lists(jvm_args, case_list_dirname) if study_meta_dictionary[study_meta_filename].get('add_global_case_list', 'false').lower() == 'true': add_global_case_list(jvm_args, study_id) # enable study update_study_status(jvm_args, study_id)