def read_configuration(config_dir) -> SourcesConfig:
    """ Parse configuration files and return set of dictionaries

    :param config_dir: Path to directory where the configs are stored
    """
    sources_config_path = path.join(config_dir, 'sources_config.json')
    if not path.exists(sources_config_path) or not path.isfile(sources_config_path):
        raise DataException(f'Cannot find {sources_config_path}')
    with open(sources_config_path, 'r') as sources_config_file:
        try:
            config_data = json.load(sources_config_file)
        except Exception as e:
            logger.error(e)
            raise DataException(f'Error parsing source config file: {sources_config_path}')
        return SourcesConfig(**config_data)
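
# A minimal usage sketch. The JSON below is a guess at the shape of
# sources_config.json, inferred from how entities, attributes and sources are
# accessed elsewhere in this code; the authoritative schema is the
# SourcesConfig model, which is not shown here.
#
# {
#     "entities": {
#         "Individual": {
#             "attributes": [
#                 {"name": "individual_id", "sources": [{"file": "individuals.tsv"}]},
#                 {"name": "gender", "sources": [{"file": "individuals.tsv", "column": "sex"}]}
#             ]
#         }
#     }
# }
#
# config = read_configuration('/path/to/config')
# config.entities['Individual'].attributes  # parsed attribute definitions
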
def validate_derived_values_not_in_source_config(entity_type: Type[BaseModel], entity_source_config: Entity):
    derived_properties = {name for (name, prop) in entity_type.schema()['properties'].items()
                          if prop.get('derived') is True}
    attribute_names = {attr.name for attr in entity_source_config.attributes}
    intersection = derived_properties.intersection(attribute_names)
    if intersection:
        raise DataException(f'Derived value fields not allowed in source files: {", ".join(intersection)}')
    def read_id_property(self, entity_type) -> str:
        entity_sources_config = self.sources_config.entities[entity_type.__name__]
        source_columns = [attribute.name for attribute in entity_sources_config.attributes]
        logger.debug(f'Source columns: {source_columns}')
        schema = entity_type.schema()
        schema_columns = list(schema['properties'].keys())
        logger.debug(f'Schema columns: {schema_columns}')
        invalid_columns = set(source_columns) - set(schema_columns)
        if invalid_columns:
            raise DataException(f'Unknown columns in source configuration: {invalid_columns}')
        return get_id_property(schema)
# Example #4
    def validate_molecule_type_agrees_with_library_strategy(
            cls, library_strategy, values):
        # Library strategies that are incompatible with each molecule type
        disallowed = {'DNA': ['RNA-Seq'],
                      'RNA': ['WXS', 'WGS', 'DNA-meth_array']}
        if 'type' in values and library_strategy is not None:
            for strategy in disallowed.get(values['type'], []):
                if strategy in library_strategy:
                    raise DataException(
                        f'{strategy} library strategy not allowed'
                        f' for molecule type: {values["type"]}')
        return library_strategy
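
# For context: a classmethod with this (cls, value, values) signature matches
# what pydantic v1 expects from a field validator. A sketch of how it might be
# attached, assuming (hypothetically) a Biosource model with 'type' and
# 'library_strategy' fields:
#
# from pydantic import BaseModel, validator
#
# class Biosource(BaseModel):
#     type: str                                  # 'DNA' or 'RNA'
#     library_strategy: Optional[List[str]] = None
#
#     _check_strategy = validator('library_strategy', allow_reuse=True)(
#         validate_molecule_type_agrees_with_library_strategy)
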
def transform_entities(entities: Any, schema: Dict, constructor: Any):
    id_property = get_id_property(schema)
    transformed = [transform_entity(entity_data, schema) for entity_data in entities]
    result = []
    for entity in transformed:
        try:
            result.append(constructor(entity))
        except Exception as e:
            logger.error(e)
            entity_name = schema['title']
            entity_id = entity[id_property]
            raise DataException(f'Invalid data for {entity_name} with id {entity_id}')
    return result
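
# Usage sketch: turning raw records into model instances. Individual is a
# hypothetical pydantic model here; any class exposing a matching .schema()
# would work.
#
# records = [{'individual_id': 'P1', 'gender': 'male'}]
# individuals = transform_entities(records, Individual.schema(),
#                                  lambda e: Individual(**e))
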
def get_source_files(entity_sources_config: Entity, id_property: str):
    source_files = {source.file
                    for attribute in entity_sources_config.attributes
                    for source in attribute.sources}
    source_file_id_mapping = {source.file: source.column if source.column is not None else attribute.name
                              for attribute in entity_sources_config.attributes
                              for source in attribute.sources
                              if attribute.name == id_property}
    logger.debug(f'Source files: {source_files}')
    logger.debug(f'Source file id mapping: {source_file_id_mapping}')
    source_files_without_id_column = source_files - set(source_file_id_mapping.keys())
    if source_files_without_id_column:
        raise DataException(f'Id column missing in source files: {source_files_without_id_column}')
    return source_files, source_file_id_mapping
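
# Sketch of the id mapping (hypothetical config): if the id attribute
# 'individual_id' is read from 'individuals.tsv' without an explicit column,
# the attribute name itself is used as the id column:
#
# source_files            == {'individuals.tsv', 'diagnoses.tsv'}
# source_file_id_mapping  == {'individuals.tsv': 'individual_id',
#                             'diagnoses.tsv': 'individual_id'}
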
# Example #7
def process_mutation_data(ngs_dir: str, output_dir: str,
                          clinical_sample_ids: List[str]) -> Set[str]:
    """
    Reads data from all MAF files in ngs_dir and creates a combined mutation data file
    and the meta and caselist files for the mutation data.

    Returns the list of sample identifiers found in the mutation data.
    An exception is raised when any of the sample identifiers is not in the clinical_sample_ids list.
    """
    mutation_samples = combine_maf(
        ngs_dir, os.path.join(output_dir, 'data_mutations.maf'))

    if mutation_samples:
        # Create meta file
        meta_filename = os.path.join(output_dir, 'meta_mutations.txt')
        create_meta_content(meta_filename,
                            cancer_study_identifier=STUDY_ID,
                            genetic_alteration_type='MUTATION_EXTENDED',
                            datatype='MAF',
                            stable_id='mutations',
                            show_profile_in_analysis_tab='true',
                            profile_name='Mutations',
                            profile_description='Mutation data',
                            data_filename='data_mutations.maf',
                            variant_classification_filter='',
                            swissprot_identifier='accession')

        # Create case list
        create_caselist(output_dir=output_dir,
                        file_name='cases_sequenced.txt',
                        cancer_study_identifier=STUDY_ID,
                        stable_id='%s_sequenced' % STUDY_ID,
                        case_list_name='Sequenced samples',
                        case_list_description='All sequenced samples',
                        case_list_category='all_cases_with_mutation_data',
                        case_list_ids="\t".join(mutation_samples))

        # Test for samples in MAF files that are not in clinical data
        if not set(clinical_sample_ids).issuperset(mutation_samples):
            logger.error(
                "Found samples in MAF files that are not in clinical data: {}".
                format(", ".join(
                    mutation_samples.difference(set(clinical_sample_ids)))))
            raise DataException(
                'Found samples in MAF files that are not in clinical data')
    return mutation_samples
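
# Usage sketch (paths and sample ids are hypothetical):
#
# samples = process_mutation_data('/data/ngs', '/data/staging',
#                                 clinical_sample_ids=['S1', 'S2', 'S3'])
#
# This writes data_mutations.maf, meta_mutations.txt and cases_sequenced.txt
# to /data/staging, and raises DataException if the MAF files contain samples
# that are absent from clinical_sample_ids.
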
# Example #8
def read_configuration(config_dir) -> OntologyConfig:
    """ Parse configuration files and return set of dictionaries

    :param config_dir: Path to directory where the configs are stored
    """
    ontology_config_path = path.join(config_dir, 'ontology_config.json')
    if not path.exists(ontology_config_path) or not path.isfile(
            ontology_config_path):
        raise FileNotFoundError(f'Cannot find {ontology_config_path}')
    with open(ontology_config_path, 'r') as ontology_config_file:
        try:
            config_data = json.load(ontology_config_file)
        except Exception as e:
            logger.error(e)
            raise DataException(
                f'Error parsing ontology config file: {ontology_config_path}')
        return OntologyConfig(**config_data)
# Example #9
def process_cna_files(ngs_dir: str, output_dir: str,
                      clinical_sample_ids: List[str]) -> List[str]:
    """
    Reads CNA data files (segmented, continuous and discrete) from ngs_dir, copies the files,
    drops and renames certain columns, and writes meta and case list files.

    Returns list of CNA sample identifiers.
    An exception is raised when any of the sample identifiers is not in the clinical_sample_ids list.
    """
    # Select all non-hidden files
    study_files = [study_file for study_file in os.listdir(ngs_dir)
                   if not study_file.startswith('.')]

    # Create sample list, required for cnaseq case list
    cna_samples = []

    # Transform CNA data files
    for study_file in study_files:
        if study_file.endswith('sha1'):
            continue

        # CNA Segment data
        if study_file.split('.')[-1] == 'seg':
            logger.debug('Transforming segment data: %s' % study_file)

            output_file = 'data_cna_segments.seg'

            # Read the file and replace the header
            with open(os.path.join(ngs_dir, study_file)) as segment_file:
                segment_lines = segment_file.readlines()
                segment_lines[0] = 'ID\tchrom\tloc.start\tloc.end\tnum.mark\tseg.mean\n'

            # Write a copy with the replaced header
            with open(os.path.join(output_dir, output_file), 'w') as segment_file:
                segment_file.writelines(segment_lines)

            # Create meta file
            meta_filename = os.path.join(output_dir, 'meta_cna_segments.txt')
            create_meta_content(
                meta_filename,
                cancer_study_identifier=STUDY_ID,
                genetic_alteration_type='COPY_NUMBER_ALTERATION',
                datatype='SEG',
                reference_genome_id='hg38',
                description='Segment data',
                data_filename=output_file)

        # CNA Continuous
        elif 'data_by_genes' in study_file:
            logger.debug('Transforming continuous CNA data: %s' % study_file)

            output_file = 'data_cna_continuous.txt'

            # Read file
            cna_data = pd.read_csv(os.path.join(ngs_dir, study_file),
                                   sep='\t',
                                   na_values=[''],
                                   dtype={'Gene ID': str})
            # Remove column and rename column names
            cna_data.drop('Cytoband', axis=1, inplace=True)
            cna_data.rename(columns={
                'Gene Symbol': 'Hugo_Symbol',
                'Gene ID': 'Entrez_Gene_Id'
            },
                            inplace=True)

            # Blank out Entrez IDs below -1; these can lead to incorrect mapping in cBioPortal
            for index, row in cna_data.iterrows():
                if int(row['Entrez_Gene_Id']) < -1:
                    cna_data.loc[index, 'Entrez_Gene_Id'] = ''
            cna_data.to_csv(os.path.join(output_dir, output_file),
                            sep='\t',
                            index=False,
                            header=True)

            # Create meta file
            meta_filename = os.path.join(output_dir, 'meta_cna_continuous.txt')
            create_meta_content(
                meta_filename,
                cancer_study_identifier=STUDY_ID,
                genetic_alteration_type='COPY_NUMBER_ALTERATION',
                datatype='LOG2-VALUE',
                stable_id='log2CNA',
                show_profile_in_analysis_tab='false',
                profile_name='Copy-number alteration values',
                profile_description='Continuous copy-number alteration values for each gene.',
                data_filename=output_file)

            # Create case list
            cna_samples = cna_data.columns[2:].tolist()
            create_caselist(output_dir=output_dir,
                            file_name='cases_cna.txt',
                            cancer_study_identifier=STUDY_ID,
                            stable_id='%s_cna' % STUDY_ID,
                            case_list_name='CNA samples',
                            case_list_description='All CNA samples',
                            case_list_category='all_cases_with_cna_data',
                            case_list_ids="\t".join(cna_samples))

            # Test for samples in CNA files that are not in clinical data
            if not set(clinical_sample_ids).issuperset(set(cna_samples)):
                logger.error(
                    "Found samples in CNA files that are not in clinical data: {}"
                    .format(", ".join(
                        set(cna_samples).difference(
                            set(clinical_sample_ids)))))
                raise DataException(
                    'Found samples in CNA files that are not in clinical data')

        # CNA Discrete
        elif 'thresholded.by_genes' in study_file:
            logger.debug('Transforming discrete CNA data: %s' % study_file)

            output_file = 'data_cna_discrete.txt'

            # Read input file
            # In the thresholded GISTIC output the Entrez gene id column is
            # named 'Locus ID' (renamed below), so keep that column as strings
            cna_data = pd.read_csv(os.path.join(ngs_dir, study_file),
                                   sep='\t',
                                   na_values=[''],
                                   dtype={'Locus ID': str})
            # Remove column and rename column names
            cna_data.drop('Cytoband', axis=1, inplace=True)
            cna_data.rename(columns={
                'Gene Symbol': 'Hugo_Symbol',
                'Locus ID': 'Entrez_Gene_Id'
            },
                            inplace=True)

            # Blank out Entrez IDs below -1; these can lead to incorrect mapping in cBioPortal
            for index, row in cna_data.iterrows():
                if int(row['Entrez_Gene_Id']) < -1:
                    cna_data.loc[index, 'Entrez_Gene_Id'] = ''
            cna_data.to_csv(os.path.join(output_dir, output_file),
                            sep='\t',
                            index=False,
                            header=True)

            # Create meta file
            meta_filename = os.path.join(output_dir, 'meta_cna_discrete.txt')
            profile_description = 'Putative copy-number alteration values for each gene from GISTIC 2.0. ' \
                                  'Values: -2 = homozygous deletion; -1 = hemizygous deletion; ' \
                                  '0 = neutral / no change; 1 = gain; 2 = high level amplification.'

            create_meta_content(
                meta_filename,
                cancer_study_identifier=STUDY_ID,
                genetic_alteration_type='COPY_NUMBER_ALTERATION',
                datatype='DISCRETE',
                stable_id='gistic',
                show_profile_in_analysis_tab='true',
                profile_name='Putative copy-number alterations from GISTIC',
                profile_description=profile_description,
                data_filename=output_file)
        elif study_file.split('.')[-2:] == ['maf', 'gz']:
            # Mutation files are transformed in a separate loop
            pass

        else:
            logger.warning("Unknown file type: %s" % study_file)
    return cna_samples
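
# The branching above keys on file names. A sketch of an ngs_dir layout this
# function would accept; the file names are hypothetical, chosen to match the
# suffix checks:
#
#   tumor.seg                     -> data_cna_segments.seg (header replaced)
#   all_data_by_genes.txt         -> data_cna_continuous.txt
#   all_thresholded.by_genes.txt  -> data_cna_discrete.txt
#   cohort.maf.gz                 -> skipped here, handled by the mutation step
#
# cna_samples = process_cna_files('/data/ngs', '/data/staging', ['S1', 'S2'])
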
# Example #10
    def check_self_reference(cls, src_biomaterial_id, values):
        if src_biomaterial_id == values['biomaterial_id']:
            raise DataException('Biomaterial cannot be derived from itself')
        return src_biomaterial_id
# Example #11
def read_codebook(codebook_filename: str) -> CodeBook:
    """Process the content of a codebook and return the reformatted codebook as an object.
    Expected import format is a tab-delimited file.

    Format:
    - A header line is followed by one or more value mapping lines.
    - Header lines contain two columns: a number and a space-separated list of column names. E.g., "1\tSEX GENDER".
    - Value mapping lines start with an empty field, followed by code/value pairs. E.g., "\t1\tMale\t2\tFemale".

    :param codebook_filename: file name of the code book
    :return: code book object
    """
    with open(codebook_filename, 'r') as code_book_file:
        column_mappings: Dict[str, ColumnValueMapping] = {}
        current_value_mapping: Dict[Any, Any] = {}
        current_columns: Optional[List[str]] = None
        for line_number, line in enumerate(code_book_file, start=1):
            line = line.strip('\n')
            if not line.startswith('\t'):
                # Save previous column mapping
                if current_columns is not None:
                    column_value_mapping = ColumnValueMapping(
                        value_mapping=current_value_mapping)
                    for column in current_columns:
                        column_mappings[column] = column_value_mapping
                # Start new column mapping
                tokens = line.split('\t')
                if len(tokens) < 2:
                    raise DataException(
                        f'Invalid header in codebook {codebook_filename} on line {line_number}'
                    )
                current_columns = [
                    column_name.lower() for column_name in tokens[1].split(' ')
                ]
                duplicate_columns = set(current_columns).intersection(
                    column_mappings.keys())
                if duplicate_columns:
                    raise DataException(
                        f'Duplicate columns in codebook on line {line_number}: {", ".join(duplicate_columns)}'
                    )
                current_value_mapping = {}
            else:
                # Add values to current value mapping
                tokens = line.split('\t')[1:]
                if len(tokens) % 2 != 0:
                    raise DataException(
                        f'Invalid value mapping in codebook {codebook_filename} on line {line_number}'
                    )
                it = iter(tokens)
                for code, value in zip(it, it):
                    if code != '' and value != '':
                        value = value.replace('"', '')
                        if code in current_value_mapping:
                            raise DataException(
                                f'Duplicate code in codebook on line {line_number}: {code}'
                            )
                        current_value_mapping[code] = value
        # Save last column mapping
        if current_columns is not None:
            column_value_mapping = ColumnValueMapping(
                value_mapping=current_value_mapping)
            for column in current_columns:
                column_mappings[column] = column_value_mapping
        return CodeBook(column_mappings=column_mappings)
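
# A worked sketch of the codebook format described in the docstring
# (file name and contents are hypothetical; <TAB> stands for a tab character):
#
#   codebook.txt:
#   1<TAB>SEX GENDER
#   <TAB>1<TAB>Male<TAB>2<TAB>Female
#   2<TAB>STATUS
#   <TAB>0<TAB>Alive<TAB>1<TAB>Deceased
#
# codebook = read_codebook('codebook.txt')
#
# Afterwards both the 'sex' and 'gender' columns share one mapping in which
# codes '1' and '2' map to 'Male' and 'Female'.
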
    def read_entity_data(self, entity_type) -> Sequence:
        """
        Reads data for an entity type from the source files that are specified
        in the sources config file.

        :param entity_type: the entity type, e.g., Individual.
        :return: A sequence of entities.
        """
        logger.info(f'* Reading {entity_type.__name__} data ...')
        if entity_type.__name__ not in self.sources_config.entities:
            raise DataException(
                f'No source configuration found for the {entity_type.__name__} entity'
            )

        id_property = self.read_id_property(entity_type)

        entity_sources_config = self.sources_config.entities[
            entity_type.__name__]

        validate_derived_values_not_in_source_config(entity_type,
                                                     entity_sources_config)

        source_files, source_file_id_mapping = get_source_files(
            entity_sources_config, id_property)

        # Read data from source files
        source_data = {}
        entity_data = {}
        for source_file in source_files:
            source_file_data = self.read_source_file_data(source_file)
            if len(source_file_data) == 0:
                raise DataException(f'No records in {source_file}')
            source_id_column = source_file_id_mapping[source_file]
            item_ids = set()
            for record_number, item in enumerate(source_file_data, start=1):
                if source_id_column not in item:
                    raise DataException(
                        f'Identifier column \'{source_id_column}\' not found in file {source_file}. '
                        f'Is the delimiter configured correctly in the sources config?'
                    )
                item_id = item.get(source_id_column, None)
                if item_id is None or item_id == '':
                    raise DataException(
                        f'Empty identifier in {source_file} record number {record_number}'
                    )
                if item_id in item_ids:
                    raise DataException(
                        f'Duplicate identifier in {source_file} record number {record_number}'
                    )
                item_ids.add(item_id)
                if item_id not in entity_data:
                    entity_data[item_id] = {id_property: item_id}
            source_data[source_file] = source_file_data

        logger.debug(f'{entity_type.__name__} entity data: {entity_data}')

        # Merge data from different sources files
        for attribute in entity_sources_config.attributes:
            if attribute.name == id_property:
                continue
            # add data to entities for attribute
            logger.debug(f'Adding data for attribute {attribute.name}')
            for source in attribute.sources:
                # default column name is the attribute name
                source_column = source.column if source.column is not None else attribute.name
                # check if column is in the source data
                first_record = source_data[source.file][0]
                if source_column not in first_record:
                    raise DataException(
                        f'Column \'{source_column}\' not found in file {source.file}. '
                        f'Is the delimiter configured correctly in the sources config?'
                    )
                # add data from source to attribute
                logger.debug(
                    f'Adding data for attribute {attribute.name} from source {source.file}:{source_column}'
                )
                source_id_column = source_file_id_mapping[source.file]
                for entity_id, entity in entity_data.items():
                    if entity.get(attribute.name) is not None:
                        continue
                    source_records = [
                        record for record in source_data[source.file]
                        if record[source_id_column] == entity_id
                    ]
                    if not source_records:
                        continue
                    if len(source_records) > 1:
                        raise DataException(
                            f'Multiple records for {entity_type.__name__}'
                            f' with id {entity_id} in file {source.file}')
                    value = source_records[0][source_column]
                    if value == '':
                        value = None
                    if value is not None and source.date_format is not None:
                        try:
                            value = datetime.strptime(value,
                                                      source.date_format)
                        except Exception as e:
                            logger.error(e)
                            raise DataException(
                                f'Error parsing {attribute.name} from'
                                f' source {source.file}:{source_column} with id {entity_id}'
                            )
                    entity[attribute.name] = value

        logger.debug(f'{entity_type.__name__} entity data: {entity_data}')

        try:
            return transform_entities(entity_data.values(),
                                      entity_type.schema(),
                                      lambda e: entity_type(**e))
        except DataException:
            logger.error(
                f'Please check source files: {", ".join(source_files)}')
            raise
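
# Usage sketch, assuming (hypothetically) that this method lives on a reader
# class that stores the parsed sources config and implements
# read_source_file_data:
#
# reader = SourceDataReader(read_configuration('/path/to/config'))
# individuals = reader.read_entity_data(Individual)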