class DataParser(object):
    """Load a tabular data file (Excel or CSV) and group its rows by subject.

    On construction the parser opens the ``opexconfig.db`` configuration
    database found in the resource directory, reads the experiment info and
    field list for ``etype`` (when given), and loads ``datafile`` into
    ``self.data`` as a pandas DataFrame.
    """

    # Added to an Excel serial date to obtain a proleptic Gregorian ordinal
    # (693594 is the ordinal of 1899-12-30, i.e. Excel serial 0).
    _EXCEL_DATE_OFFSET = 693594

    def __init__(self, datafile=None, sheet=0, skiplines=0, header=None, etype=None):
        """
        :param datafile: full pathname to the data file (.xlsx, .xls or .csv)
        :param sheet: Excel sheet number or name passed to pandas
        :param skiplines: number of leading rows to skip in an Excel sheet
        :param header: header row index for Excel; None uses the pandas default
        :param etype: experiment type used to look up info and fields in the db
        :raises IOError: if the config database is not readable
        """
        msg = 'DataParser: datafile=%s sheet=%s skiplines=%s header=%s etype=%s' % (
            datafile, str(sheet), str(skiplines), str(header), str(etype))
        print(msg)
        logging.info(msg)
        self.datafile = datafile  # full pathname to data file
        # Defaults, so later method calls never hit a missing attribute
        # (e.g. sortSubjects() on a parser created without a datafile).
        self.data = None
        self.subjects = None
        self.ftype = None
        self.sheet = sheet
        self.skiplines = skiplines
        self.header = header
        self.resource_dir = findResourceDir()
        configdb = join(self.resource_dir, 'opexconfig.db')
        if not access(configdb, R_OK):
            print(configdb)
            raise IOError('Cannot find config database {}'.format(configdb))
        self.dbi = DBI(configdb)
        self.etype = etype
        if etype is not None:
            self.info = self.dbi.getInfo(etype)
            self.fields = self.dbi.getFields(etype)
        else:
            self.info = None
            self.fields = None
        self.incorrect = self.dbi.getIDs()
        if self.datafile:
            # extension - xlsx or csv
            self.ftype = splitext(basename(self.datafile))[1]
            self._loadData()

    def getInfoFromFile(self, etype):
        """
        Read the experiment info rows for ``etype`` from ``opex.csv``
        in the resource directory.

        :param etype: value matched against the 'Expt' column
        :return: DataFrame of the matching rows
        :raises ValueError: if the file cannot be read or filtered
        """
        try:
            opex = pandas.read_csv(join(self.resource_dir, 'opex.csv'))
            info = opex[opex['Expt'] == etype]
        except Exception as e:
            raise ValueError("Unable to get expt info from file", e)
        return info

    def getIdsFromFile(self):
        """
        Read ``incorrectIds.csv`` from the resource directory.

        :return: DataFrame with the file contents
        :raises ValueError: if the file cannot be read
        """
        try:
            info = pandas.read_csv(join(self.resource_dir, 'incorrectIds.csv'))
        except Exception as e:
            raise ValueError("Unable to get ids from file", e)
        return info

    def __checkSID(self, sid):
        """
        Replace known incorrect IDs from db

        :param sid: subject ID as found in the data
        :return: corrected ID from the db, or ``sid`` unchanged when no db
                 interface is available
        """
        if self.dbi is None:
            return sid
        rsid = self.dbi.getCorrectID(sid)
        if rsid != sid:
            msg = 'Subject: %s corrected to %s' % (sid, rsid)
            logging.warning(msg)
        return rsid

    def _loadData(self):
        """
        Load ``self.datafile`` into ``self.data`` according to ``self.ftype``.

        Rows that are entirely NaN are dropped and remaining NaNs are replaced
        with empty strings. ``self.data`` is None when the file type is not
        supported or the requested Excel sheet could not be read.
        """
        # Reset first so a failed read leaves a well-defined state.
        self.data = None
        if self.ftype == '.xlsx' or self.ftype == '.xls':
            # NOTE(review): newer pandas releases may reject skip_blank_lines /
            # encoding for read_excel - confirm against the pinned pandas version.
            options = dict(skiprows=self.skiplines, sheet_name=self.sheet,
                           skip_blank_lines=True, encoding='utf-8')
            if self.header is not None:
                options['header'] = self.header
            try:
                self.data = pandas.read_excel(self.datafile, **options)
            except IndexError as e:
                # str(e) rather than e.message: the latter is absent in Python 3
                msg = 'Excel sheet number/name expected was {} but not found: '.format(
                    self.sheet) + str(e)
                logging.error(msg)
                print(msg)
        elif self.ftype == '.csv':
            self.data = pandas.read_csv(self.datafile, skip_blank_lines=True)

        if self.data is not None:
            msg = 'Data loaded from %s' % self.datafile
            logging.info(msg)
            print(msg)
            # cleanup if rows are all NaN
            self.data.dropna(how="all", axis=0, inplace=True)
            # replace remaining NaNs with empty string; fillna returns a new
            # frame, so the result has to be assigned back
            self.data = self.data.fillna("")
        else:
            print('No data to load')

    def sortSubjects(self, subjectfield='ID'):
        '''
        Sort data into subjects by participant ID
         - this should be overwritten if the data is organized differently

        Only IDs whose string form is exactly 6 characters long are kept; each
        is passed through the incorrect-ID lookup before being used as a key
        of ``self.subjects``.

        :param subjectfield: name of the column holding the subject ID
        :raises ValueError: if ``subjectfield`` is not a column of the data
        '''
        self.subjects = dict()
        if self.data is not None:
            if subjectfield not in self.data.columns:
                raise ValueError('Subject ID field not present: ', subjectfield)
            for sid in self.data[subjectfield].unique():
                if len(str(sid)) == 6:
                    sidkey = self.__checkSID(sid)
                    self.subjects[sidkey] = self.data[self.data[subjectfield] == sid]
            msg = 'Subjects loaded=%d' % len(self.subjects)
            print(msg)

    def _excelToDatetime(self, orig):
        """Convert an Excel serial date (number or numeric string) to a datetime."""
        return datetime.fromordinal(self._EXCEL_DATE_OFFSET + int(orig))

    def formatDobNumber(self, orig):
        """
        Reformats DOB string from Excel data float to yyyy-mm-dd
        """
        return self._excelToDatetime(orig).strftime("%Y-%m-%d")

    def formatCondensedDate(self, orig):
        """
        Reformats date number from Excel to yyyymmdd
        """
        return self._excelToDatetime(orig).strftime("%Y%m%d")

    def getPrefix(self):
        """Return the 'prefix' entry of the experiment info, or None if no info is loaded."""
        prefix = None
        if self.info is not None:
            prefix = self.info['prefix']
        return prefix

    def getxsd(self):
        """Return the 'xsitype' entry of the experiment info, or None if no info is loaded."""
        xsd = None
        if self.info is not None:
            xsd = self.info['xsitype']
        return xsd
class TestDBquery(unittest.TestCase):
    """Checks the DBI query methods against the ``opexconfig_test.db`` database."""

    def setUp(self):
        """Create a DBI on the test config database and open its connection."""
        self.resourcedir = findResourceDir()
        self.dbi = DBI(join(self.resourcedir, 'opexconfig_test.db'))
        self.dbi.getconn()

    def tearDown(self):
        """Close the connection opened in setUp."""
        self.dbi.conn.close()

    def test_getIDs(self):
        """getIDs returns at least one entry."""
        ids = self.dbi.getIDs()
        self.assertGreater(len(ids), 0)

    def test_updateIDs(self):
        """addIDs reports one row per (incorrect, correct) pair read from the CSV."""
        csvpath = join('..', 'resources', 'incorrectIds.csv')
        frame = pandas.read_csv(csvpath)
        idlist = []
        for _, row in frame.iterrows():
            idlist.append((row['INCORRECT'], row['CORRECT']))
        cnt = self.dbi.addIDs(idlist)
        self.assertEqual(len(idlist), cnt)

    def test_getRunOptions(self):
        """getRunOptions returns at least one entry."""
        options = self.dbi.getRunOptions()
        self.assertGreater(len(options), 0)

    def test_getFields(self):
        """getFields returns the expected field list for 'CANTAB MOT'."""
        etype = 'CANTAB MOT'
        data = self.dbi.getFields(etype)
        print(etype, ": ", data)
        self.assertGreater(len(data), 0)
        self.assertListEqual([u'MOTML', u'MOTSDL', u'MOTTC', u'MOTTE'], data)

    def test_getInfo(self):
        """getInfo returns the prefix and xsitype for 'MULTIPLEX'."""
        etype = 'MULTIPLEX'
        data = self.dbi.getInfo(etype)
        print(etype, ": ", data)
        self.assertGreater(len(data), 0)
        self.assertDictEqual(
            {'prefix': u'MPX', 'xsitype': u'opex:bloodMultiplexData'}, data)

    def test_getInfo_missing(self):
        """getInfo returns None for 'CANTAB'."""
        etype = 'CANTAB'
        data = self.dbi.getInfo(etype)
        print(etype, ": ", data)
        self.assertIsNone(data)

    def test_getCorrectID(self):
        """A known incorrect ID is mapped to its corrected value."""
        cid = self.dbi.getCorrectID('1040DR')
        self.assertEqual('1040DA', cid)

    def test_getCorrectID_missing(self):
        """An ID with no correction entry is returned unchanged."""
        incorrectid = '1020HC'
        self.assertEqual(incorrectid, self.dbi.getCorrectID(incorrectid))

    def test_getDatelessExpts(self):
        """getDatelessExpts returns at least one entry."""
        expts = self.dbi.getDatelessExpts()
        self.assertGreater(len(expts), 0)

    def test_getExpts(self):
        """getExpts returns at least one entry."""
        expts = self.dbi.getExpts()
        self.assertGreater(len(expts), 0)

    def test_getXsitypeFromPrefix(self):
        """Prefix 'MPX' resolves to the multiplex xsitype."""
        xsitype = self.dbi.getXsitypeFromPrefix('MPX')
        self.assertEqual('opex:bloodMultiplexData', xsitype)

    def test_getTotal(self):
        """getTotal for 'GODIN' is 5."""
        total = self.dbi.getTotal('GODIN')
        self.assertEqual(5, total)

    def test_getInterval(self):
        """getInterval for 'GODIN' is 3."""
        interval = self.dbi.getInterval('GODIN')
        self.assertEqual(3, interval)

    def test_getInfo_TASK(self):
        """
        Checking taskret and taskencode
        """
        cases = (
            ('TASKRET', 'opex:fmritaskret'),
            ('TASKENCODE', 'opex:fmritaskencode'),
        )
        for expt, expected in cases:
            data = self.dbi.getInfo(expt)
            self.assertEqual(expected, data['xsitype'])
            fields = self.dbi.getFields(expt)
            self.assertGreater(len(fields), 0)