def read_configuration(config_dir) -> SourcesConfig:
    """
    Parse the sources configuration file and return it as a SourcesConfig.

    :param config_dir: Path to directory where the configs are stored
    :return: the parsed sources configuration.
    :raises DataException: when the file is missing or cannot be parsed.
    """
    sources_config_path = path.join(config_dir, 'sources_config.json')
    # isfile() already implies existence, so one check suffices.
    if not path.isfile(sources_config_path):
        raise DataException(f'Cannot find {sources_config_path}')
    with open(sources_config_path, 'r') as sources_config_file:
        try:
            config_data = json.load(sources_config_file)
        except (json.JSONDecodeError, UnicodeDecodeError) as e:
            logger.error(e)
            # Chain the original error so the root cause stays visible.
            raise DataException(
                f'Error parsing source config file: {sources_config_path}') from e
    return SourcesConfig(**config_data)
def validate_derived_values_not_in_source_config(entity_type: BaseModel, entity_source_config: Entity):
    """
    Check that no derived property of the entity type is configured as a
    source attribute: derived values are computed, never read from files.

    :param entity_type: the entity model class whose schema is inspected.
    :param entity_source_config: the source configuration for the entity.
    :raises DataException: listing the offending attribute names.
    """
    properties = entity_type.schema()['properties'].items()
    derived_names = {name for name, prop in properties if prop.get('derived') is True}
    configured_names = {attribute.name for attribute in entity_source_config.attributes}
    clashes = derived_names & configured_names
    if clashes:
        raise DataException(
            f'Derived value fields not allowed in source files: {", ".join(clashes)}')
def read_id_property(self, entity_type) -> str:
    """
    Resolve the identifier property for an entity type, validating that every
    configured source column is part of the entity schema.

    :param entity_type: the entity class, e.g., Individual.
    :return: the name of the identifier property.
    :raises DataException: when the source configuration refers to columns
        that the entity schema does not define.
    """
    entity_sources_config = self.sources_config.entities[entity_type.__name__]
    source_columns = [attribute.name for attribute in entity_sources_config.attributes]
    logger.debug(f'Source columns: {source_columns}')
    schema = entity_type.schema()
    schema_columns = list(schema['properties'].keys())
    logger.debug(f'Schema columns: {schema_columns}')
    invalid_columns = set(source_columns).difference(schema_columns)
    if invalid_columns:
        raise DataException(f'Unknown columns in source configuration: {invalid_columns}')
    return get_id_property(schema)
def validate_molecule_type_agrees_with_library_strategy(
        cls, library_strategy, values):
    """
    Validator: reject library strategies that conflict with the molecule type.

    :param library_strategy: container of library strategy names (may be None).
    :param values: previously validated field values; only 'type' is read.
    :return: the (unmodified) library_strategy value.
    :raises DataException: when a strategy is not allowed for the molecule type.
    """
    # Strategies that are incompatible with each molecule type; the tuple
    # order preserves the original order in which conflicts were reported.
    disallowed = {
        'DNA': ('RNA-Seq',),
        'RNA': ('WXS', 'WGS', 'DNA-meth_array'),
    }
    if 'type' in values and library_strategy is not None:
        molecule_type = values['type']
        for strategy in disallowed.get(molecule_type, ()):
            # Use the `in` operator instead of calling __contains__ directly.
            if strategy in library_strategy:
                raise DataException(
                    f'Not allowed {strategy} library strategy'
                    f' for molecule type: {molecule_type}')
    return library_strategy
def transform_entities(entities: Any, schema: Dict, constructor: Any):
    """
    Transform raw entity dictionaries and build entity objects from them.

    :param entities: iterable of raw entity data dictionaries.
    :param schema: the entity JSON schema; used for value transformation and
        for resolving the identifier property and entity title.
    :param constructor: callable that builds an entity object from a dict.
    :return: list of constructed entity objects.
    :raises DataException: when constructing an entity fails; the message
        names the entity type and the offending identifier.
    """
    id_property = get_id_property(schema)
    # Bind the transformed records to a new name instead of rebinding the
    # `entities` parameter.
    transformed = [transform_entity(entity_data, schema) for entity_data in entities]
    result = []
    for entity in transformed:
        try:
            result.append(constructor(entity))
        except Exception as e:
            logger.error(e)
            entity_name = schema['title']
            # `entity_id` instead of `id`, which shadows the builtin.
            entity_id = entity[id_property]
            # Chain the original error so the root cause stays visible.
            raise DataException(
                f'Invalid data for {entity_name} with id {entity_id}') from e
    return result
def get_source_files(entity_sources_config: Entity, id_property: str):
    """
    Collect the set of source files referenced by an entity configuration
    and, per file, the column that holds the entity identifier.

    :param entity_sources_config: source configuration for one entity.
    :param id_property: name of the identifier attribute.
    :return: tuple of (set of source file names, mapping from file name to
        identifier column name).
    :raises DataException: when any source file lacks an identifier column.
    """
    # Set/dict comprehensions instead of set([...]) / dict([...]).
    source_files = {source.file
                    for attribute in entity_sources_config.attributes
                    for source in attribute.sources}
    # For the id attribute, the source column defaults to the attribute name.
    source_file_id_mapping = {
        source.file: source.column if source.column is not None else attribute.name
        for attribute in entity_sources_config.attributes
        for source in attribute.sources
        if attribute.name == id_property}
    logger.debug(f'Source files: {source_files}')
    logger.debug(f'Source file id mapping: {source_file_id_mapping}')
    source_files_without_id_column = source_files - set(source_file_id_mapping.keys())
    if source_files_without_id_column:
        raise DataException(
            f'Id column missing in source files: {source_files_without_id_column}')
    return source_files, source_file_id_mapping
def process_mutation_data(ngs_dir: str, output_dir: str,
                          clinical_sample_ids: List[str]) -> Set[str]:
    """
    Combine all MAF files in ngs_dir into one mutation data file and write the
    accompanying meta and case list files.

    :param ngs_dir: directory containing the MAF input files.
    :param output_dir: directory where output files are written.
    :param clinical_sample_ids: sample identifiers known from clinical data.
    :return: the set of sample identifiers found in the mutation data.
    :raises DataException: when the mutation data contains samples that are
        not present in clinical_sample_ids.
    """
    mutation_samples = combine_maf(
        ngs_dir, os.path.join(output_dir, 'data_mutations.maf'))
    # Guard clause: nothing else to do when no mutation samples were found.
    if not mutation_samples:
        return mutation_samples
    # Write the meta file describing the combined mutation data.
    create_meta_content(os.path.join(output_dir, 'meta_mutations.txt'),
                        cancer_study_identifier=STUDY_ID,
                        genetic_alteration_type='MUTATION_EXTENDED',
                        datatype='MAF',
                        stable_id='mutations',
                        show_profile_in_analysis_tab='true',
                        profile_name='Mutations',
                        profile_description='Mutation data',
                        data_filename='data_mutations.maf',
                        variant_classification_filter='',
                        swissprot_identifier='accession')
    # Write the case list of all sequenced samples.
    create_caselist(output_dir=output_dir,
                    file_name='cases_sequenced.txt',
                    cancer_study_identifier=STUDY_ID,
                    stable_id=f'{STUDY_ID}_sequenced',
                    case_list_name='Sequenced samples',
                    case_list_description='All sequenced samples',
                    case_list_category='all_cases_with_mutation_data',
                    case_list_ids="\t".join(mutation_samples))
    # Any sample present in the MAF files but absent from the clinical data
    # is an error.
    unknown_samples = mutation_samples.difference(set(clinical_sample_ids))
    if unknown_samples:
        logger.error(
            "Found samples in MAF files that are not in clinical data: {}".
            format(", ".join(unknown_samples)))
        raise DataException(
            'Found samples in MAF files that are not in clinical data')
    return mutation_samples
def read_configuration(config_dir) -> OntologyConfig:
    """
    Parse the ontology configuration file and return it as an OntologyConfig.

    :param config_dir: Path to directory where the configs are stored
    :return: the parsed ontology configuration.
    :raises FileNotFoundError: when the file does not exist.
    :raises DataException: when the file cannot be parsed.
    """
    ontology_config_path = path.join(config_dir, 'ontology_config.json')
    # isfile() already implies existence, so one check suffices.
    if not path.isfile(ontology_config_path):
        raise FileNotFoundError(f'Cannot find {ontology_config_path}')
    with open(ontology_config_path, 'r') as ontology_config_file:
        try:
            config_data = json.load(ontology_config_file)
        except (json.JSONDecodeError, UnicodeDecodeError) as e:
            logger.error(e)
            # Chain the original error so the root cause stays visible.
            raise DataException(
                f'Error parsing ontology config file: {ontology_config_path}') from e
    return OntologyConfig(**config_data)
def process_cna_files(ngs_dir: str, output_dir: str,
                      clinical_sample_ids: List[str]) -> List[str]:
    """
    Reads CNA data files (segmented, continuous and discrete) from ngs_dir,
    copies the files, drops and renames certain columns, and writes meta and
    case list files. Returns list of CNA sample identifiers.
    An exception is raised when any of the sample identifiers is not in the
    clinical_sample_ids list.

    :param ngs_dir: directory containing the CNA input files.
    :param output_dir: directory where transformed files are written.
    :param clinical_sample_ids: sample identifiers known from clinical data.
    :return: list of CNA sample identifiers (empty if no continuous CNA file
        was processed).
    :raises DataException: when CNA samples are missing from clinical data.
    """
    # Select all non-hidden files
    study_files = []
    for study_file in os.listdir(ngs_dir):
        if not study_file.startswith('.'):
            study_files.append(study_file)
    # Create sample list, required for cnaseq case list.
    # NOTE(review): only the continuous-CNA branch fills this list; if no
    # 'data_by_genes' file is present, an empty list is returned.
    cna_samples = []
    # Transform CNA data files; each recognised file type gets its own branch.
    for study_file in study_files:
        # Skip checksum files.
        if study_file.endswith('sha1'):
            continue
        # CNA Segment data
        if study_file.split('.')[-1] == 'seg':
            logger.debug('Transforming segment data: %s' % study_file)
            output_file = 'data_cna_segments.seg'
            # Read file and replace header
            with open(os.path.join(ngs_dir, study_file)) as segment_file:
                segment_lines = segment_file.readlines()
            # NOTE(review): .seg files are normally tab-delimited; confirm the
            # delimiter in this header literal matches the file body.
            segment_lines[0] = 'ID chrom loc.start loc.end num.mark seg.mean\n'
            # Write a copy with replaced header
            with open(os.path.join(output_dir, output_file), 'w') as segment_file:
                segment_file.writelines(segment_lines)
            # Create meta file
            meta_filename = os.path.join(output_dir, 'meta_cna_segments.txt')
            create_meta_content(
                meta_filename,
                cancer_study_identifier=STUDY_ID,
                genetic_alteration_type='COPY_NUMBER_ALTERATION',
                datatype='SEG',
                reference_genome_id='hg38',
                description='Segment data',
                data_filename=output_file)
        # CNA Continuous
        elif 'data_by_genes' in study_file:
            logger.debug('Transforming continuous CNA data: %s' % study_file)
            output_file = 'data_cna_continuous.txt'
            # Read file; keep 'Gene ID' as string so it survives the round trip.
            cna_data = pd.read_csv(os.path.join(ngs_dir, study_file),
                                   sep='\t',
                                   na_values=[''],
                                   dtype={'Gene ID': str})
            # Remove column and rename column names to cBioPortal conventions.
            cna_data.drop('Cytoband', axis=1, inplace=True)
            cna_data.rename(columns={
                'Gene Symbol': 'Hugo_Symbol',
                'Gene ID': 'Entrez_Gene_Id'
            },
                            inplace=True)
            # Remove negative Entrez IDs. This can lead to incorrect mapping
            # in cBioPortal.
            for index, row in cna_data.iterrows():
                if int(row['Entrez_Gene_Id']) < -1:
                    cna_data.loc[index, 'Entrez_Gene_Id'] = ''
            cna_data.to_csv(os.path.join(output_dir, output_file),
                            sep='\t',
                            index=False,
                            header=True)
            # Create meta file
            meta_filename = os.path.join(output_dir, 'meta_cna_continuous.txt')
            create_meta_content(
                meta_filename,
                cancer_study_identifier=STUDY_ID,
                genetic_alteration_type='COPY_NUMBER_ALTERATION',
                datatype='LOG2-VALUE',
                stable_id='log2CNA',
                show_profile_in_analysis_tab='false',
                profile_name='Copy-number alteration values',
                profile_description=
                'Continuous copy-number alteration values for each gene.',
                data_filename=output_file)
            # Create case list; sample columns start after the two gene
            # identifier columns.
            cna_samples = cna_data.columns[2:].tolist()
            create_caselist(output_dir=output_dir,
                            file_name='cases_cna.txt',
                            cancer_study_identifier=STUDY_ID,
                            stable_id='%s_cna' % STUDY_ID,
                            case_list_name='CNA samples',
                            case_list_description='All CNA samples',
                            case_list_category='all_cases_with_cna_data',
                            case_list_ids="\t".join(cna_samples))
            # Test for samples in CNA files that are not in clinical data
            if not set(clinical_sample_ids).issuperset(set(cna_samples)):
                logger.error(
                    "Found samples in CNA files that are not in clinical data: {}"
                    .format(", ".join(
                        set(cna_samples).difference(
                            set(clinical_sample_ids)))))
                raise DataException(
                    'Found samples in CNA files that are not in clinical data')
        # CNA Discrete
        elif 'thresholded.by_genes' in study_file:
            logger.debug('Transforming discrete CNA data: %s' % study_file)
            output_file = 'data_cna_discrete.txt'
            # Read input file
            cna_data = pd.read_csv(os.path.join(ngs_dir, study_file),
                                   sep='\t',
                                   na_values=[''],
                                   dtype={'Gene ID': str})
            # Remove column and rename column names; discrete files use
            # 'Locus ID' for the Entrez gene id.
            cna_data.drop('Cytoband', axis=1, inplace=True)
            cna_data.rename(columns={
                'Gene Symbol': 'Hugo_Symbol',
                'Locus ID': 'Entrez_Gene_Id'
            },
                            inplace=True)
            # Remove negative Entrez IDs. This can lead to incorrect mapping
            # in cBioPortal.
            for index, row in cna_data.iterrows():
                if int(row['Entrez_Gene_Id']) < -1:
                    cna_data.loc[index, 'Entrez_Gene_Id'] = ''
            cna_data.to_csv(os.path.join(output_dir, output_file),
                            sep='\t',
                            index=False,
                            header=True)
            # Create meta file
            meta_filename = os.path.join(output_dir, 'meta_cna_discrete.txt')
            # NOTE(review): implicit string concatenation leaves no space
            # between '...GISTIC 2.0.' and 'Values:' — confirm intended.
            profile_description = 'Putative copy-number alteration values for each gene from GISTIC 2.0.' \
                'Values: -2 = homozygous deletion; -1 = hemizygous deletion;' \
                '0 = neutral / no change; 1 = gain; 2 = high level amplification.'
            create_meta_content(
                meta_filename,
                cancer_study_identifier=STUDY_ID,
                genetic_alteration_type='COPY_NUMBER_ALTERATION',
                datatype='DISCRETE',
                stable_id='gistic',
                show_profile_in_analysis_tab='true',
                profile_name='Putative copy-number alterations from GISTIC',
                profile_description=profile_description,
                data_filename=output_file)
        elif study_file.split('.')[-2:] == ['maf', 'gz']:
            # Mutations file are transformed in an other loop
            pass
        else:
            logger.warning("Unknown file type: %s" % study_file)
    return cna_samples
def check_self_reference(cls, src_biomaterial_id, values):
    """
    Validator: a biomaterial must not list itself as its source biomaterial.

    :param src_biomaterial_id: identifier of the source biomaterial (may be None).
    :param values: previously validated field values; 'biomaterial_id' may be
        absent when its own validation failed.
    :return: the (unmodified) src_biomaterial_id value.
    :raises DataException: when the biomaterial references itself.
    """
    # Use .get() so a missing 'biomaterial_id' (its own validation failed)
    # does not raise KeyError here; guard on None so an absent source
    # reference is never treated as a self-reference.
    if src_biomaterial_id is not None \
            and src_biomaterial_id == values.get('biomaterial_id'):
        raise DataException('Biomaterial cannot be derived from itself')
    return src_biomaterial_id
def read_codebook(codebook_filename: str) -> CodeBook:
    """Process the content of a codebook and return the reformatted codebook
    as an object. Expected import format is a tab-delimited file. Format:
    - A header line is followed by one or more value mapping lines.
    - Header lines contain two columns: a number and a space separated list
      of column names. E.g., "1\tSEX GENDER".
    - Value mapping lines start with an empty field, followed by code, value
      pairs. E.g., "\t1\tMale\t2\tFemale".

    :param codebook_filename: file name of the code book
    :return: code book object
    :raises DataException: on malformed headers, uneven code/value pairs,
        duplicate columns or duplicate codes.
    """
    with open(codebook_filename, 'r') as code_book_file:
        # Accumulated mapping of column name -> value mapping object.
        column_mappings: Dict[str, ColumnValueMapping] = {}
        # Value mapping being built for the current header's columns.
        current_value_mapping: Dict[Any, Any] = {}
        # Columns of the current header; None until the first header is seen.
        current_columns: Optional[List[str]] = None
        line_number = 0
        for line in code_book_file:
            line_number += 1
            # Strip only the newline: leading tabs are structurally relevant.
            line = line.strip('\n')
            if not line.startswith('\t'):
                # Header line: first flush the mapping collected for the
                # previous header (if any).
                # Save previous column mapping
                if current_columns is not None:
                    column_value_mapping = ColumnValueMapping(
                        value_mapping=current_value_mapping)
                    # The same value mapping is shared by all listed columns.
                    for column in current_columns:
                        column_mappings[column] = column_value_mapping
                # Start new column mapping
                tokens = line.split('\t')
                if len(tokens) < 2:
                    raise DataException(
                        f'Invalid header in codebook {codebook_filename} on line {line_number}'
                    )
                # Column names are space separated in the second field and
                # normalised to lower case.
                current_columns = [
                    column_name.lower() for column_name in tokens[1].split(' ')
                ]
                duplicate_columns = set(current_columns).intersection(
                    column_mappings.keys())
                if duplicate_columns:
                    raise DataException(
                        f'Duplicate columns in codebook on line {line_number}: {", ".join(duplicate_columns)}'
                    )
                current_value_mapping = {}
            else:
                # Value mapping line: alternating code, value fields after
                # the leading empty field.
                # Add values to current value mapping
                tokens = line.split('\t')[1:]
                if len(tokens) % 2 != 0:
                    raise DataException(
                        f'Invalid value mapping in codebook {codebook_filename} on line {line_number}'
                    )
                # zip(it, it) pairs consecutive tokens: (code, value).
                it = iter(tokens)
                for code, value in zip(it, it):
                    # Skip pairs where either half is empty.
                    if code != '' and value != '':
                        # Strip double quotes from the mapped value.
                        value = value.replace('"', '')
                        if code in current_value_mapping:
                            raise DataException(
                                f'Duplicate code in codebook on line {line_number}: {code}'
                            )
                        current_value_mapping[code] = value
        # Save last column mapping
        if current_columns is not None:
            column_value_mapping = ColumnValueMapping(
                value_mapping=current_value_mapping)
            for column in current_columns:
                column_mappings[column] = column_value_mapping
        return CodeBook(column_mappings=column_mappings)
def read_entity_data(self, entity_type) -> Sequence:
    """
    Reads data for an entity type from the source files that are specified
    in the sources config file.

    :param entity_type: the entity type, e.g., Individual.
    :return: A sequence of entities.
    :raises DataException: when no source configuration exists for the type,
        when records are missing/empty/duplicated, or when parsing fails.
    """
    logger.info(f'* Reading {entity_type.__name__} data ...')
    if entity_type.__name__ not in self.sources_config.entities:
        raise DataException(
            f'No source configuration found for the {entity_type.__name__} entity'
        )
    # Identifier property for this entity type (also validates columns).
    id_property = self.read_id_property(entity_type)
    entity_sources_config = self.sources_config.entities[
        entity_type.__name__]
    validate_derived_values_not_in_source_config(entity_type,
                                                 entity_sources_config)
    source_files, source_file_id_mapping = get_source_files(
        entity_sources_config, id_property)
    # Read data from source files; collect per-file records and seed one
    # entity dict per unique identifier.
    source_data = {}
    entity_data = {}
    for source_file in source_files:
        source_file_data = self.read_source_file_data(source_file)
        if len(source_file_data) == 0:
            raise DataException(f'No records in {source_file}')
        source_id_column = source_file_id_mapping[source_file]
        record_number = 0
        item_ids = set()
        for item in source_file_data:
            record_number += 1
            if source_id_column not in item.keys():
                raise DataException(
                    f'Identifier column \'{source_id_column}\' not found in file {source_file}. '
                    f'Is the delimiter configured correctly in the sources config?'
                )
            item_id = item.get(source_id_column, None)
            if item_id is None or item_id == '':
                raise DataException(
                    f'Empty identifier in {source_file} record number {record_number}'
                )
            # Identifiers must be unique within a single source file.
            if item_id in item_ids:
                raise DataException(
                    f'Duplicate identifier in {source_file} record number {record_number}'
                )
            item_ids.add(item_id)
            if item_id not in entity_data:
                entity_data[item_id] = {id_property: item_id}
        source_data[source_file] = source_file_data
    logger.debug(f'{entity_type.__name__} entity data: {entity_data}')
    # Merge data from different sources files
    for attribute in entity_sources_config.attributes:
        # The identifier was already set when seeding the entities.
        if attribute.name == id_property:
            continue
        # add data to entities for attribute
        logger.debug(f'Adding data for attribute {attribute.name}')
        for source in attribute.sources:
            # default column name is the attribute name
            source_column = source.column if source.column is not None else attribute.name
            # check if column is in the source data
            first_record = source_data[source.file][0]
            if source_column not in first_record.keys():
                raise DataException(
                    f'Column \'{source_column}\' not found in file {source.file}. '
                    f'Is the delimiter configured correctly in the sources config?'
                )
            # add data from source to attribute
            logger.debug(
                f'Adding data for attribute {attribute.name} from source {source.file}:{source_column}'
            )
            source_id_column = source_file_id_mapping[source.file]
            for entity_id, entity in entity_data.items():
                # First source that supplies a value wins; later sources do
                # not overwrite it.
                if entity.get(attribute.name) is not None:
                    continue
                source_records = list([
                    record for record in source_data[source.file]
                    if record[source_id_column] == entity_id
                ])
                if not source_records:
                    continue
                if len(source_records) > 1:
                    raise DataException(
                        f'Multiple records for {entity_type.__name__}'
                        f' with id {entity_id} in file {source.file}')
                value = source_records[0][source_column]
                # Empty strings are normalised to missing values.
                if value == '':
                    value = None
                # Parse dates when a date format is configured for the source.
                if value is not None and source.date_format is not None:
                    try:
                        value = datetime.strptime(value, source.date_format)
                    except Exception as e:
                        logger.error(e)
                        raise DataException(
                            f'Error parsing {attribute.name} from'
                            f' source {source.file}:{source_column} with id {entity_id}'
                        )
                entity[attribute.name] = value
    logger.debug(f'{entity_type.__name__} entity data: {entity_data}')
    try:
        # Construct typed entity objects from the merged dictionaries.
        return transform_entities(entity_data.values(), entity_type.schema(),
                                  lambda e: entity_type(**e))
    except DataException as e:
        logger.error(
            f'Please check source files: {", ".join(source_files)}')
        raise e