def get_formatted_scans_key_row(item):
    """Build one scans-key row from the DICOM referenced by *item*.

    Parameters
    ----------
    item
        Conversion item; ``item[-1][0]`` is expected to be the path of a
        DICOM file belonging to the series.

    Returns
    -------
    row: list
        [ISO acquisition time, performing physician name, random string]
    """
    dcm_fn = item[-1][0]
    from heudiconv.external.dcmstack import ds
    mw = ds.wrapper_from_data(
        dcm.read_file(dcm_fn, stop_before_pixels=True, force=True))
    # we need to store filenames and acquisition times
    # parse date and time and get it into isoformat.
    # Not every DICOM carries ContentDate/ContentTime, so guard the access
    # instead of letting an AttributeError propagate (matches the handling
    # used by the other variants of this function in this module).
    try:
        date = mw.dcm_data.ContentDate
        time = mw.dcm_data.ContentTime.split('.')[0]
        td = time + date
        acq_time = datetime.strptime(td, '%H%M%S%Y%m%d').isoformat()
    except AttributeError as exc:
        lgr.warning("Failed to get date/time for the content: %s", str(exc))
        acq_time = None
    # add random string (8 printable ASCII characters)
    randstr = ''.join(map(chr, sample(k=8, population=range(33, 127))))
    try:
        perfphys = mw.dcm_data.PerformingPhysicianName
    except AttributeError:
        perfphys = ''
    row = [acq_time, perfphys, randstr]
    # empty entries should be 'n/a'
    # https://github.com/dartmouth-pbs/heudiconv/issues/32
    row = ['n/a' if not str(e) else e for e in row]
    return row
def test_grouping(tmpdir, subject):
    """Conflicting studies must abort conversion unless grouping is forced."""
    dicoms = []
    for name in ['axasc35.dcm', 'phantom.dcm']:
        dicoms.append(op.join(TESTS_DATA_PATH, name))
    # the two fixture files must belong to distinct studies
    uids = set()
    for path in dicoms:
        uids.add(dcm.read_file(path, stop_before_pixels=True).StudyInstanceUID)
    assert len(uids) == len(dicoms)

    # symlink both DICOMs into a single subject directory
    outdir = tmpdir.mkdir('out')
    datadir = tmpdir.mkdir(subject)
    for path in dicoms:
        os.symlink(path, (datadir / op.basename(path)).strpath)

    template = op.join("{subject}/*.dcm")
    hargs = gen_heudiconv_args(tmpdir.strpath, outdir.strpath, subject,
                               'convertall.py', template=template)

    # mixed studies under one subject are rejected by default
    with pytest.raises(AssertionError):
        runner(hargs)

    # group all found DICOMs under subject, despite conflicts
    runner(hargs + ["-g", "all"])
    assert len(list(outdir.visit(fil='run0*'))) == 4

    tsv = outdir / 'participants.tsv'
    assert tsv.check()
    lines = tsv.open().readlines()
    assert len(lines) == 2
    assert lines[1].split('\t')[0] == 'sub-{}'.format(subject)
def test_private_csa_header(tmpdir):
    """Fields absent from public tags must be recoverable from the CSA header."""
    dcm_file = op.join(TESTS_DATA_PATH, 'axasc35.dcm')
    dcm_data = dcm.read_file(dcm_file, stop_before_pixels=True)
    for pub, priv in DICOM_FIELDS_TO_TEST.items():
        # ensure missing public tag.  The original wrote `dcm.pub`, which
        # looked up the literal attribute "pub" on the pydicom module and
        # therefore always raised AttributeError, testing nothing.
        with pytest.raises(AttributeError):
            getattr(dcm_data, pub)
        # ensure private tag is found
        assert parse_private_csa_header(dcm_data, pub, priv) != ''
        # and quickly run heudiconv with no conversion.
        # (fixed a missing comma: '-c' 'none' concatenated to '-cnone')
        runner(['--files', dcm_file, '-c', 'none', '-f', 'reproin'])
def get_dicom_series_time(dicom_list):
    """Return the series acquisition time as seconds since the epoch.

    Reads SeriesDate/SeriesTime from the first DICOM of the series;
    primarily to be used for reproducible time stamping.
    """
    import time
    import calendar

    header = dcm.read_file(dicom_list[0], stop_before_pixels=True, force=True)
    # SeriesTime may carry fractional seconds (HHMMSS.MICROSEC) -- drop them
    seconds_only, _, _ = header.SeriesTime.partition('.')
    stamp = header.SeriesDate + seconds_only  # YYYYMMDDHHMMSS
    # convert to epoch
    return calendar.timegm(time.strptime(stamp, '%Y%m%d%H%M%S'))
def get_dicom_series_time(dicom_list):
    """Get time in seconds since epoch from dicom series date and time

    Primarily to be used for reproducible time stamping

    Parameters
    ----------
    dicom_list : list of str
        Paths to the DICOM files of one series; only the first is read.

    Returns
    -------
    int
        Seconds since the epoch.
        NOTE(review): calendar.timegm interprets the scanner-local
        SeriesDate/SeriesTime as if it were UTC -- adequate for
        reproducible stamping, not for wall-clock accuracy; confirm
        callers do not rely on timezone correctness.
    """
    import time
    import calendar

    dicom = dcm.read_file(dicom_list[0], stop_before_pixels=True, force=True)
    dcm_date = dicom.SeriesDate  # YYYYMMDD
    dcm_time = dicom.SeriesTime  # HHMMSS.MICROSEC
    # drop fractional seconds before parsing
    dicom_time_str = dcm_date + dcm_time.split('.', 1)[0]  # YYYYMMDDHHMMSS
    # convert to epoch
    return calendar.timegm(time.strptime(dicom_time_str, '%Y%m%d%H%M%S'))
def get_formatted_scans_key_row(dcm_fn):
    """Build one scans-key row from a single DICOM file.

    Parameters
    ----------
    dcm_fn : str
        Path to a DICOM file.

    Returns
    -------
    row: list
        [ISO acquisition time, performing physician name, random string]
    """
    dcm_data = dcm.read_file(dcm_fn, stop_before_pixels=True, force=True)
    # we need to store filenames and acquisition times
    # parse date and time and get it into isoformat;
    # not every DICOM carries ContentDate/ContentTime, hence the guard
    try:
        date = dcm_data.ContentDate
        time = dcm_data.ContentTime.split('.')[0]
        td = time + date
        acq_time = datetime.strptime(td, '%H%M%S%Y%m%d').isoformat()
    except AttributeError as exc:
        lgr.warning("Failed to get date/time for the content: %s", str(exc))
        acq_time = None
    # add random string
    # But let's make it reproducible by using all UIDs
    # (might change across versions?)
    # NOTE(review): relies on sorted(dir(dcm_data)) yielding the same
    # *UID attribute set across pydicom versions -- the hash may differ
    # if pydicom adds/renames UID keywords; confirm acceptable.
    randcontent = u''.join(
        [getattr(dcm_data, f) or '' for f in sorted(dir(dcm_data))
         if f.endswith('UID')])
    # first 8 hex digits of the md5 stand in for a "random" string
    randstr = hashlib.md5(randcontent.encode()).hexdigest()[:8]
    try:
        perfphys = dcm_data.PerformingPhysicianName
    except AttributeError:
        perfphys = ''
    row = [acq_time, perfphys, randstr]
    # empty entries should be 'n/a'
    # https://github.com/dartmouth-pbs/heudiconv/issues/32
    row = ['n/a' if not str(e) else e for e in row]
    return row
def get_formatted_scans_key_row(item):
    """Assemble a scans-key row for the DICOM referenced by *item*.

    Parameters
    ----------
    item
        Conversion item; ``item[-1][0]`` is expected to be the path of a
        DICOM file belonging to the series.

    Returns
    -------
    row: list
        [ISO acquisition time, performing physician name, random string]
    """
    from heudiconv.external.dcmstack import ds

    dicom_path = item[-1][0]
    mw = ds.wrapper_from_data(
        dcm.read_file(dicom_path, stop_before_pixels=True, force=True))
    # we need to store filenames and acquisition times;
    # parse ContentDate/ContentTime into isoformat when present
    try:
        stamp = (mw.dcm_data.ContentTime.split('.')[0]
                 + mw.dcm_data.ContentDate)
        acq_time = datetime.strptime(stamp, '%H%M%S%Y%m%d').isoformat()
    except AttributeError as exc:
        lgr.warning("Failed to get date/time for the content: %s", str(exc))
        acq_time = None
    # add random string: 8 printable ASCII characters
    randstr = ''.join(chr(code) for code in sample(range(33, 127), 8))
    perfphys = getattr(mw.dcm_data, 'PerformingPhysicianName', '')
    row = [acq_time, perfphys, randstr]
    # empty entries should be 'n/a'
    # https://github.com/dartmouth-pbs/heudiconv/issues/32
    return ['n/a' if not str(e) else e for e in row]
def group_dicoms_into_seqinfos(files, file_filter, dcmfilter, grouping):
    """Process list of dicoms and return seqinfo and file group

    `seqinfo` contains per-sequence extract of fields from DICOMs which
    will be later provided into heuristics to decide on filenames

    Parameters
    ----------
    files : list of str
        List of files to consider
    file_filter : callable, optional
        Applied to each item of filenames. Should return True if file needs to be
        kept, False otherwise.
    dcmfilter : callable, optional
        If called on dcm_data and returns True, it is used to set series_id
    grouping : {'studyUID', 'accession_number', None}, optional
        what to group by: studyUID or accession_number

    Returns
    -------
    seqinfo : list of list
        `seqinfo` is a list of info entries per each sequence (some entry
        there defines a key for `filegrp`)
    filegrp : dict
        `filegrp` is a dictionary with files groupped per each sequence
    """
    allowed_groupings = ['studyUID', 'accession_number', None]
    if grouping not in allowed_groupings:
        raise ValueError('I do not know how to group by {0}'.format(grouping))
    per_studyUID = grouping == 'studyUID'
    per_accession_number = grouping == 'accession_number'
    lgr.info("Analyzing %d dicoms", len(files))

    # parallel lists: groups[0] holds a series_id per file, groups[1] the
    # index of its representative wrapper in mwgroup
    groups = [[], []]
    mwgroup = []

    studyUID = None
    # for sanity check that all DICOMs came from the same
    # "study". If not -- what is the use-case? (interrupted acquisition?)
    # and how would then we deal with series numbers
    # which would differ already
    if file_filter:
        nfl_before = len(files)
        files = list(filter(file_filter, files))
        nfl_after = len(files)
        lgr.info('Filtering out {0} dicoms based on their filename'.format(
            nfl_before-nfl_after))
    for fidx, filename in enumerate(files):
        from heudiconv.external.dcmstack import ds
        # TODO after getting a regression test check if the same behavior
        # with stop_before_pixels=True
        mw = ds.wrapper_from_data(dcm.read_file(filename, force=True))

        # drop volatile signature fields so that volumes of one series
        # compare equal in is_same_series below
        for sig in ('iop', 'ICE_Dims', 'SequenceName'):
            try:
                del mw.series_signature[sig]
            except:
                # NOTE(review): bare except -- presumably guarding a
                # KeyError when the signature lacks `sig`; consider
                # narrowing to `except KeyError`
                pass

        try:
            file_studyUID = mw.dcm_data.StudyInstanceUID
        except AttributeError:
            lgr.info("File {} is missing any StudyInstanceUID".format(filename))
            file_studyUID = None

        try:
            series_id = (int(mw.dcm_data.SeriesNumber),
                         mw.dcm_data.ProtocolName)
            file_studyUID = mw.dcm_data.StudyInstanceUID

            if not per_studyUID:
                # verify that we are working with a single study
                if studyUID is None:
                    studyUID = file_studyUID
                elif not per_accession_number:
                    assert studyUID == file_studyUID, (
                        "Conflicting study identifiers found [{}, {}].".format(
                            studyUID, file_studyUID
                        ))
        except AttributeError as exc:
            lgr.warning('Ignoring %s since not quite a "normal" DICOM: %s',
                        filename, exc)
            series_id = (-1, 'none')
            file_studyUID = None

        if not series_id[0] < 0:
            if dcmfilter is not None and dcmfilter(mw.dcm_data):
                series_id = (-1, mw.dcm_data.ProtocolName)

        # filter out unwanted non-image-data DICOMs by assigning
        # a series number < 0 (see test below)
        if not series_id[0] < 0 and mw.dcm_data[0x0008, 0x0016].repval in (
                'Raw Data Storage',
                'GrayscaleSoftcopyPresentationStateStorage'):
            series_id = (-1, mw.dcm_data.ProtocolName)

        if per_studyUID:
            series_id = series_id + (file_studyUID,)

        ingrp = False
        for idx in range(len(mwgroup)):
            # same = mw.is_same_series(mwgroup[idx])
            if mw.is_same_series(mwgroup[idx]):
                # the same series should have the same study uuid
                assert (mwgroup[idx].dcm_data.get('StudyInstanceUID', None)
                        == file_studyUID)
                ingrp = True
                if series_id[0] >= 0:
                    # adopt the representative's id so all members agree
                    series_id = (mwgroup[idx].dcm_data.SeriesNumber,
                                 mwgroup[idx].dcm_data.ProtocolName)
                    if per_studyUID:
                        series_id = series_id + (file_studyUID,)
                groups[0].append(series_id)
                groups[1].append(idx)

        if not ingrp:
            mwgroup.append(mw)
            groups[0].append(series_id)
            groups[1].append(len(mwgroup) - 1)

    group_map = dict(zip(groups[0], groups[1]))

    total = 0
    seqinfo = OrderedDict()

    # for the next line to make any sense the series_id needs to
    # be sortable in a way that preserves the series order
    for series_id, mwidx in sorted(group_map.items()):
        if series_id[0] < 0:
            # skip our fake series with unwanted files
            continue
        mw = mwgroup[mwidx]
        if mw.image_shape is None:
            # this whole thing has now image data (maybe just PSg DICOMs)
            # nothing to see here, just move on
            continue
        dcminfo = mw.dcm_data
        series_files = [files[i] for i, s in enumerate(groups[0])
                        if s == series_id]
        # turn the series_id into a human-readable string -- string is needed
        # for JSON storage later on
        if per_studyUID:
            studyUID = series_id[2]
            series_id = series_id[:2]
        accession_number = dcminfo.get('AccessionNumber')

        series_id = '-'.join(map(str, series_id))

        # image dimensions plus number of files in the series
        size = list(mw.image_shape) + [len(series_files)]
        total += size[-1]
        if len(size) < 4:
            size.append(1)

        # MG - refactor into util function
        try:
            TR = float(dcminfo.RepetitionTime) / 1000.
        except (AttributeError, ValueError):
            TR = -1
        try:
            TE = float(dcminfo.EchoTime)
        except (AttributeError, ValueError):
            TE = -1
        try:
            refphys = str(dcminfo.ReferringPhysicianName)
        except AttributeError:
            refphys = ''
        try:
            # NOTE(review): series_desc is never used below
            # (SeqInfo reads dcminfo.get('SeriesDescription') directly)
            series_desc = dcminfo.SeriesDescription
        except AttributeError:
            series_desc = ''

        motion_corrected = 'MOCO' in image_type

        if dcminfo.get([0x18,0x24], None):
            # GE and Philips scanners
            sequence_name = dcminfo[0x18,0x24].value
        elif dcminfo.get([0x19, 0x109c], None):
            # Siemens scanners
            sequence_name = dcminfo[0x19, 0x109c].value
        else:
            sequence_name = 'Not found'

        info = SeqInfo(
            total,
            op.split(series_files[0])[1],
            series_id,
            op.basename(op.dirname(series_files[0])),
            '-', '-',
            size[0], size[1], size[2], size[3],
            TR, TE,
            dcminfo.ProtocolName,
            motion_corrected,
            'derived' in [x.lower() for x in dcminfo.get('ImageType', [])],
            dcminfo.get('PatientID'),
            dcminfo.get('StudyDescription'),
            refphys,
            dcminfo.get('SeriesDescription'),
            sequence_name,
            image_type,
            accession_number,
            # For demographics to populate BIDS participants.tsv
            dcminfo.get('PatientAge'),
            dcminfo.get('PatientSex'),
            dcminfo.get('AcquisitionDate'),
        )
        # candidates
        # dcminfo.AccessionNumber
        #   len(dcminfo.ReferencedImageSequence)
        #   len(dcminfo.SourceImageSequence)
        # FOR demographics
        if per_studyUID:
            key = studyUID.split('.')[-1]
        elif per_accession_number:
            key = accession_number
        else:
            key = ''
        lgr.debug("%30s %30s %27s %27s %5s nref=%-2d nsrc=%-2d %s" % (
            key,
            info.series_id,
            dcminfo.SeriesDescription,
            dcminfo.ProtocolName,
            info.is_derived,
            len(dcminfo.get('ReferencedImageSequence', '')),
            len(dcminfo.get('SourceImageSequence', '')),
            info.image_type
        ))
        if per_studyUID:
            if studyUID not in seqinfo:
                seqinfo[studyUID] = OrderedDict()
            seqinfo[studyUID][info] = series_files
        elif per_accession_number:
            if accession_number not in seqinfo:
                seqinfo[accession_number] = OrderedDict()
            seqinfo[accession_number][info] = series_files
        else:
            seqinfo[info] = series_files

    if per_studyUID:
        lgr.info("Generated sequence info for %d studies with %d entries total",
                 len(seqinfo), sum(map(len, seqinfo.values())))
    elif per_accession_number:
        lgr.info("Generated sequence info for %d accession numbers with %d "
                 "entries total", len(seqinfo), sum(map(len, seqinfo.values())))
    else:
        lgr.info("Generated sequence info with %d entries", len(seqinfo))
    return seqinfo
def group_dicoms_into_seqinfos(files, file_filter, dcmfilter, grouping):
    """Process list of dicoms and return seqinfo and file group

    `seqinfo` contains per-sequence extract of fields from DICOMs which
    will be later provided into heuristics to decide on filenames

    Parameters
    ----------
    files : list of str
        List of files to consider
    file_filter : callable, optional
        Applied to each item of filenames. Should return True if file needs to be
        kept, False otherwise.
    dcmfilter : callable, optional
        If called on dcm_data and returns True, it is used to set series_id
    grouping : {'studyUID', 'accession_number', None}, optional
        what to group by: studyUID or accession_number

    Returns
    -------
    seqinfo : list of list
        `seqinfo` is a list of info entries per each sequence (some entry
        there defines a key for `filegrp`)
    filegrp : dict
        `filegrp` is a dictionary with files groupped per each sequence
    """
    # Fixes vs the previous revision:
    # - removed leftover debug logging ("FFFF...", "WWWW...", vars(mw) dumps)
    # - removed `grouping = 'accession_number'  # tjs temp`, which silently
    #   overrode the caller's validated `grouping` argument
    # - narrowed a bare `except:` to KeyError
    # - dropped an unused duplicate `same = mw.is_same_series(...)` call and
    #   the unused `series_desc` local
    allowed_groupings = ['studyUID', 'accession_number', None]
    if grouping not in allowed_groupings:
        raise ValueError('I do not know how to group by {0}'.format(grouping))
    per_studyUID = grouping == 'studyUID'
    per_accession_number = grouping == 'accession_number'
    lgr.info("Analyzing %d dicoms", len(files))

    # parallel lists: groups[0] holds a series_id per file, groups[1] the
    # index of its representative wrapper in mwgroup
    groups = [[], []]
    mwgroup = []

    studyUID = None
    # for sanity check that all DICOMs came from the same
    # "study". If not -- what is the use-case? (interrupted acquisition?)
    # and how would then we deal with series numbers
    # which would differ already
    if file_filter:
        nfl_before = len(files)
        files = list(filter(file_filter, files))
        nfl_after = len(files)
        lgr.info('Filtering out {0} dicoms based on their filename'.format(
            nfl_before - nfl_after))
    for fidx, filename in enumerate(files):
        from heudiconv.external.dcmstack import ds
        # TODO after getting a regression test check if the same behavior
        # with stop_before_pixels=True
        mw = ds.wrapper_from_data(dcm.read_file(filename, force=True))

        # drop volatile signature fields so that volumes of one series
        # compare equal in is_same_series below
        for sig in ('iop', 'ICE_Dims', 'SequenceName'):
            try:
                del mw.series_signature[sig]
            except KeyError:
                pass

        try:
            file_studyUID = mw.dcm_data.StudyInstanceUID
        except AttributeError:
            lgr.info(
                "File {} is missing any StudyInstanceUID".format(filename))
            file_studyUID = None

        try:
            series_id = (int(mw.dcm_data.SeriesNumber),
                         mw.dcm_data.ProtocolName)
            file_studyUID = mw.dcm_data.StudyInstanceUID

            if not per_studyUID:
                # verify that we are working with a single study
                if studyUID is None:
                    studyUID = file_studyUID
                elif not per_accession_number:
                    assert studyUID == file_studyUID, (
                        "Conflicting study identifiers found [{}, {}].".format(
                            studyUID, file_studyUID))
        except AttributeError as exc:
            lgr.warning('Ignoring %s since not quite a "normal" DICOM: %s',
                        filename, exc)
            series_id = (-1, 'none')
            file_studyUID = None

        if not series_id[0] < 0:
            if dcmfilter is not None and dcmfilter(mw.dcm_data):
                series_id = (-1, mw.dcm_data.ProtocolName)

        # filter out unwanted non-image-data DICOMs by assigning
        # a series number < 0 (see test below)
        if not series_id[0] < 0 and mw.dcm_data[0x0008, 0x0016].repval in (
                'Raw Data Storage',
                'GrayscaleSoftcopyPresentationStateStorage'):
            series_id = (-1, mw.dcm_data.ProtocolName)

        if per_studyUID:
            series_id = series_id + (file_studyUID, )

        ingrp = False
        for idx in range(len(mwgroup)):
            if mw.is_same_series(mwgroup[idx]):
                # the same series should have the same study uuid
                assert (mwgroup[idx].dcm_data.get('StudyInstanceUID', None)
                        == file_studyUID)
                ingrp = True
                if series_id[0] >= 0:
                    # adopt the representative's id so all members agree
                    series_id = (mwgroup[idx].dcm_data.SeriesNumber,
                                 mwgroup[idx].dcm_data.ProtocolName)
                    if per_studyUID:
                        series_id = series_id + (file_studyUID, )
                groups[0].append(series_id)
                groups[1].append(idx)

        if not ingrp:
            mwgroup.append(mw)
            groups[0].append(series_id)
            groups[1].append(len(mwgroup) - 1)

    group_map = dict(zip(groups[0], groups[1]))

    total = 0
    seqinfo = OrderedDict()

    # for the next line to make any sense the series_id needs to
    # be sortable in a way that preserves the series order
    for series_id, mwidx in sorted(group_map.items()):
        if series_id[0] < 0:
            # skip our fake series with unwanted files
            continue
        mw = mwgroup[mwidx]
        if mw.image_shape is None:
            # this whole thing has now image data (maybe just PSg DICOMs)
            # nothing to see here, just move on
            continue
        dcminfo = mw.dcm_data
        series_files = [
            files[i] for i, s in enumerate(groups[0]) if s == series_id
        ]
        # turn the series_id into a human-readable string -- string is needed
        # for JSON storage later on
        if per_studyUID:
            studyUID = series_id[2]
            series_id = series_id[:2]
        accession_number = dcminfo.get('AccessionNumber')

        series_id = '-'.join(map(str, series_id))

        # image dimensions plus number of files in the series
        size = list(mw.image_shape) + [len(series_files)]
        total += size[-1]
        if len(size) < 4:
            size.append(1)

        # MG - refactor into util function
        try:
            TR = float(dcminfo.RepetitionTime) / 1000.
        except (AttributeError, ValueError):
            TR = -1
        try:
            TE = float(dcminfo.EchoTime)
        except (AttributeError, ValueError):
            TE = -1
        try:
            refphys = str(dcminfo.ReferringPhysicianName)
        except AttributeError:
            refphys = ''
        try:
            image_type = tuple(dcminfo.ImageType)
        except AttributeError:
            image_type = ''

        motion_corrected = 'MOCO' in image_type

        if dcminfo.get([0x18, 0x24], None):
            # GE and Philips scanners
            sequence_name = dcminfo[0x18, 0x24].value
        elif dcminfo.get([0x19, 0x109c], None):
            # Siemens scanners
            sequence_name = dcminfo[0x19, 0x109c].value
        else:
            sequence_name = 'Not found'

        info = SeqInfo(
            total,
            op.split(series_files[0])[1],
            series_id,
            op.basename(op.dirname(series_files[0])),
            '-', '-',
            size[0], size[1], size[2], size[3],
            TR, TE,
            dcminfo.ProtocolName,
            motion_corrected,
            'derived' in [x.lower() for x in dcminfo.get('ImageType', [])],
            dcminfo.get('PatientID'),
            dcminfo.get('StudyDescription'),
            refphys,
            dcminfo.get('SeriesDescription'),
            sequence_name,
            image_type,
            accession_number,
            # For demographics to populate BIDS participants.tsv
            dcminfo.get('PatientAge'),
            dcminfo.get('PatientSex'),
            dcminfo.get('AcquisitionDate'),
        )
        # candidates
        # dcminfo.AccessionNumber
        #   len(dcminfo.ReferencedImageSequence)
        #   len(dcminfo.SourceImageSequence)
        # FOR demographics
        if per_studyUID:
            key = studyUID.split('.')[-1]
        elif per_accession_number:
            key = accession_number
        else:
            key = ''
        lgr.debug(
            "%30s %30s %27s %27s %5s nref=%-2d nsrc=%-2d %s" %
            (key, info.series_id, dcminfo.SeriesDescription,
             dcminfo.ProtocolName, info.is_derived,
             len(dcminfo.get('ReferencedImageSequence', '')),
             len(dcminfo.get('SourceImageSequence', '')), info.image_type))
        if per_studyUID:
            if studyUID not in seqinfo:
                seqinfo[studyUID] = OrderedDict()
            seqinfo[studyUID][info] = series_files
        elif per_accession_number:
            if accession_number not in seqinfo:
                seqinfo[accession_number] = OrderedDict()
            seqinfo[accession_number][info] = series_files
        else:
            seqinfo[info] = series_files

    if per_studyUID:
        lgr.info(
            "Generated sequence info for %d studies with %d entries total",
            len(seqinfo), sum(map(len, seqinfo.values())))
    elif per_accession_number:
        lgr.info(
            "Generated sequence info for %d accession numbers with %d "
            "entries total", len(seqinfo), sum(map(len, seqinfo.values())))
    else:
        lgr.info("Generated sequence info with %d entries", len(seqinfo))
    return seqinfo