def read_source_file_data(self, source_file) -> Sequence[Dict[str, Any]]:
    """
    Read one source file from the input directory and apply its codebook, if any.

    When ``sources_config.file_format`` has an entry for the file, that entry's
    ``delimiter`` is passed to the reader; otherwise the reader is created with
    the path only. When ``sources_config.codebooks`` has an entry for the file,
    the data is passed through a ``CodeBookMapper`` built from that codebook file.

    :param source_file: name of the source file, relative to the input directory
    :return: data read from the file, codebook-mapped when a codebook is configured
    """
    config = self.sources_config
    source_path = path.join(self.input_dir, source_file)

    # A missing/empty file_format section means no per-file settings exist.
    file_format = config.file_format.get(source_file, None) if config.file_format else None
    if file_format is None:
        reader = TabularFileReader(source_path)
    else:
        reader = TabularFileReader(source_path, file_format.delimiter)
    rows = reader.read_data()

    # Without a codebook for this file the data is returned untouched.
    if config.codebooks is None:
        return rows
    codebook_filename = config.codebooks.get(source_file, None)
    if codebook_filename is None:
        return rows

    mapper = CodeBookMapper(path.join(self.input_dir, codebook_filename))
    return mapper.apply(rows)
def read_data(self, filename: str) -> Optional[Sequence[NGS]]:
    """
    Reads .txt file whose header carries the sample identifiers.

    Every column name other than Gene Symbol, Gene ID, Locus ID and Cytoband
    is treated as a sample identifier.

    :param filename: name of the input file
    :return: result of ``map_ngs`` for the collected biosource/biomaterial mapping
    :raises ReaderException: if the file yields no data, or if the header
        contains no sample identifier column
    """
    non_sample_columns = ('Gene Symbol', 'Gene ID', 'Locus ID', 'Cytoband')

    data = TabularFileReader(os.path.join(self.input_dir, filename)).read_data()
    if not data:
        raise ReaderException(
            "Cannot read NGS data from file: {}. Empty data.".format(
                filename))

    biomaterials_by_biosource = {}
    sample_column_count = 0
    # Iterating the first row yields the column names
    # (presumably rows are dicts keyed by header -- confirm against TabularFileReader).
    for column_name in data[0]:
        if column_name in non_sample_columns:
            continue
        sample_column_count += 1
        pair = self.biosource_biomaterial_from_sample_id(column_name, filename)
        biomaterials_by_biosource.setdefault(pair[0], []).append(pair[1])

    if sample_column_count == 0:
        raise ReaderException(
            "Cannot read NGS data from file: {}. No sample_id found in header"
            .format(filename))

    return self.map_ngs(biomaterials_by_biosource, filename)
def read_data(self, filename: str) -> Optional[Sequence[NGS]]:
    """
    Reads .maf.gz. file. Sample_id should be specified in the
    :attr:`self.sample_id_column_name` column.

    Each row's sample id is resolved to a (biosource, biomaterial) pair and the
    biomaterials are grouped per biosource before being handed to ``map_ngs``.

    :param filename: name of the input file
    :return: Sequence of NGS objects
    :raises ReaderException: if the file does not contain more than one row, or
        if a row has no :attr:`self.sample_id_column_name` column
    """
    data = TabularFileReader(os.path.join(self.input_dir, filename)).read_data()

    # NOTE(review): a file with exactly one row is also reported as "Empty data"
    # by this check -- confirm whether that is intended or should be `if not data`.
    if len(data) <= 1:
        raise ReaderException(
            "Cannot read NGS data from file: {}. Empty data.".format(
                filename))

    biosource_biomaterial_dict = dict()
    for row in data:
        try:
            col_value = row[self.sample_id_column_name]
        except KeyError as err:
            # Chain explicitly so the traceback shows the missing key as the
            # direct cause instead of "during handling ... another exception".
            raise ReaderException(
                "Invalid {} file. No column with name {}. Cannot read sample ids."
                .format(filename, self.sample_id_column_name)) from err
        biosource_biomaterial = self.biosource_biomaterial_from_sample_id(
            col_value, filename)
        biosource_biomaterial_dict.setdefault(
            biosource_biomaterial[0], []).append(biosource_biomaterial[1])

    return self.map_ngs(biosource_biomaterial_dict, filename)
def read_data(self, filename: str) -> Optional[Sequence[NGS]]:
    """
    Reads .txt file whose header carries the sample identifiers.

    Only column names starting with 'PMC' are treated as sample identifiers.

    :param filename: name of the input file
    :return: result of ``map_ngs`` for the collected biosource/biomaterial mapping
    :raises ReaderException: if the file yields no data, or if no column name
        in the header starts with 'PMC'
    """
    data = TabularFileReader(os.path.join(self.input_dir, filename)).read_data()
    if not data:
        raise ReaderException(
            "Cannot read NGS data from file: {}. Empty data.".format(
                filename))

    biomaterials_by_biosource = {}
    sample_column_count = 0
    # Iterating the first row yields the column names
    # (presumably rows are dicts keyed by header -- confirm against TabularFileReader).
    for column_name in data[0]:
        if not column_name.startswith('PMC'):
            continue
        sample_column_count += 1
        pair = self.biosource_biomaterial_from_sample_id(column_name, filename)
        biomaterials_by_biosource.setdefault(pair[0], []).append(pair[1])

    if sample_column_count == 0:
        raise ReaderException(
            "Cannot read NGS data from file: {}. No sample_id found in header"
            .format(filename))

    return self.map_ngs(biomaterials_by_biosource, filename)
def test_transformation(tmp_path):
    """Run sources2csr on the CLINICAL test data and check the generated TSV files."""

    def first_match(rows, column, value):
        # First row whose `column` equals `value`; IndexError when nothing matches.
        return [row for row in rows if row[column] == value][0]

    target_path = tmp_path.as_posix()
    cli_args = [
        './test_data/input_data/CLINICAL',
        target_path,
        './test_data/input_data/config',
    ]
    result = CliRunner().invoke(sources2csr.run, cli_args)
    assert result.exit_code == 0

    # every expected output file must have been written
    expected_outputs = (
        '/individual.tsv',
        '/diagnosis.tsv',
        '/biosource.tsv',
        '/biomaterial.tsv',
        '/study.tsv',
        '/individual_study.tsv',
    )
    for output_file in expected_outputs:
        assert path.exists(target_path + output_file)

    individual_data = TabularFileReader(path.join(target_path, 'individual.tsv')).read_data()

    # test if codebook mapping has been applied
    p1 = first_match(individual_data, 'individual_id', 'P1')
    assert p1['gender'] == 'female'

    # test if derived values have been calculated, if not read from the source
    assert p1['diagnosis_count'] == '2'
    assert p1['age_first_diagnosis'] == '23'  # 01-05-2016 - 01-02-1993

    # test if aggregate values have been correctly inserted from the source
    p2 = first_match(individual_data, 'individual_id', 'P2')
    assert p2['diagnosis_count'] == '4'
    assert p2['age_first_diagnosis'] == '50'

    # check if data from second input file is included
    assert p2['ic_withdrawn_date'] == '2018-06-02'

    # check if data from higher priority files are not overwritten
    p6 = first_match(individual_data, 'individual_id', 'P6')
    assert p6['ic_withdrawn_date'] == '2017-10-14'

    # test reading of biosources from CSV file
    biosource_data = TabularFileReader(path.join(target_path, 'biosource.tsv')).read_data()
    bs1 = first_match(biosource_data, 'biosource_id', 'BS1')
    assert bs1['tissue'] == 'medula'
    assert bs1['biosource_date'] == '2017-03-12'
    assert bs1['tumor_percentage'] == '5'
def read_data(self, filename: str) -> Optional[Sequence[NGS]]:
    """
    Reads .seg file as tab separated file, taking the sample ID from the
    first column of every row.

    :param filename: name of the input file
    :return: Sequence of NGS objects
    :raises ReaderException: if the file does not contain more than one row
    """
    data = TabularFileReader(os.path.join(self.input_dir, filename)).read_data()

    # NOTE(review): a file with exactly one row is also reported as "Empty data"
    # by this check -- confirm whether that is intended.
    if len(data) <= 1:
        raise ReaderException("Cannot read NGS data from file: {}. Empty data.".format(filename))

    biomaterials_by_biosource = {}
    for row in data:
        # The first value of the row is the sample identifier.
        first_value = list(row.values())[0]
        pair = self.biosource_biomaterial_from_sample_id(first_value, filename)
        biomaterials_by_biosource.setdefault(pair[0], []).append(pair[1])

    return self.map_ngs(biomaterials_by_biosource, filename)
def read_entities(self, file_path: str, entity_type: Type[BaseModel]) -> List[Any]:
    """
    Read a tabular file and build one ``entity_type`` instance per row.

    Before construction the row values are normalised in place: empty strings
    and 'NA' become ``None``, values of date fields are parsed with the
    '%Y-%m-%d' format, and values of array fields are parsed as JSON. Date and
    array fields are derived from ``entity_type.schema()``.

    :param file_path: path of the file to read
    :param entity_type: model class instantiated with each row as keyword arguments
    :return: list of entities; an empty list when the file does not exist
    """
    try:
        rows = TabularFileReader(file_path).read_data()
    except FileNotFoundError:
        return []

    date_fields = self.get_date_fields(entity_type.schema())
    array_fields = self.get_array_fields(entity_type.schema())

    def normalise(row):
        # Only existing keys are reassigned, so mutating during iteration is safe.
        for field, raw in row.items():
            if raw in ('', 'NA'):
                row[field] = None
            elif field in date_fields:
                row[field] = datetime.strptime(raw, '%Y-%m-%d')
            elif field in array_fields:
                row[field] = json.loads(raw)

    # Convert every row first; entities are only built once all rows parsed cleanly.
    for row in rows:
        normalise(row)

    return [entity_type(**row) for row in rows]