def __init__(self, *args, **kwargs):
    """Parse IPAQ questionnaire data.

    Drops all-empty columns, restricts the frame to the ID column plus
    the known IPAQ fields, strips whitespace from subject IDs and
    groups rows by subject.

    Raises:
        ValueError: if the base parser failed to load any data.
    """
    DataParser.__init__(self, *args, **kwargs)
    if self.data is None:
        raise ValueError('IPAQ Parser: Data not loaded')
    self.type = ''
    # Default XNAT metadata when not supplied - set before processing,
    # consistent with the other questionnaire parsers in this file.
    if self.info is None:
        self.info = {'prefix': 'IP', 'xsitype': 'opex:ipaq'}
    # Remove columns that are entirely empty
    self.data.dropna(axis=1, how='all', inplace=True)
    fields = ['sitting', 'walking_days', 'walking_time',
              'moderate_days', 'moderate_time',
              'vigorous_days', 'vigorous_time', 'pa', 'mvpa']
    self.fields = fields
    self.data = self.data[['ID'] + fields]
    # Strip stray whitespace from the subject ID (column 0)
    self.data['ID'] = self.data.apply(lambda x: stripspaces(x, 0), axis=1)
    self.sortSubjects('ID')
    print('Data load complete')
def __init__(self, *args):
    """Parse DEXA data.

    Loads the field/header definitions from dexa_fields.xlsx, relabels
    the data columns, extracts subject info, and splits the
    measurements into one DataFrame per collection interval.

    Raises:
        ValueError: if the fields spreadsheet is not readable.
    """
    DataParser.__init__(self, *args)
    fields = join(self.resource_dir, "dexa_fields.xlsx")
    # Guard clause: fail fast if the definitions file is unavailable
    if not access(fields, R_OK):
        raise ValueError("Cannot access fields file: %s" % fields)
    # Replace field headers. 'sheet_name' (not the deprecated
    # 'sheetname') for consistency with the COSMED parser.
    self.fields = pd.read_excel(fields, header=0, sheet_name='dexa_fields')
    df_header = pd.read_excel(fields, header=0, sheet_name='dexa_header')
    self.header = df_header['concatenated'].tolist()
    self.data.columns = self.header
    print(("Loaded rows=", len(self.data['ID'])))
    # Extract subject info; copy() avoids SettingWithCopyWarning on the slice
    df_subj = self.data.iloc[:, 0:4].copy()
    df_subj['SubjectID'] = df_subj.apply(lambda x: stripspaces(x, 'ID'), axis=1)
    # Split data into intervals (months -> XNAT interval label)
    self.intervals = {0: 'BASELINE', 3: 'MIDPOINT', 6: 'ENDPOINT',
                      9: 'MID-FOLLOW-UP', 12: 'FOLLOW-UP'}
    self.df = dict()
    for i, intval in list(self.intervals.items()):
        # Columns for this interval are prefixed with the interval label
        cols = [c for c in self.header if c.startswith(intval)]
        # Strip the interval prefix so per-interval frames share names
        simplecols = []
        for col in cols:
            cparts = col.split("_")
            simplecols.append("_".join(cparts[1:]))
        self.df[i] = pd.concat([df_subj, self.data[cols]], axis=1)
        self.df[i].columns = df_subj.columns.tolist() + simplecols
        if DEBUG:
            msg = "Interval=%s data=%d" % (intval, len(self.df[i]))
            print(msg)
    self.sortSubjects('SubjectID')
def __init__(self, *args):
    """Initialise the parser and group the loaded rows by subject."""
    DataParser.__init__(self, *args)
    # Per-subject bookkeeping, filled in during parsing
    self.dates = dict()
    self.subjects = dict()
    self.interval = None
    self.sortSubjects()
def __init__(self, *args): DataParser.__init__(self, *args) #Replace field headers self.fields = ['depression', 'anxiety', 'stress'] ncols = [] for ix in range(0, 13, 3): ncols += [c + '_' + str(ix) for c in self.fields] dropcols = [] #remove check columns # self.data.set_index(list(self.data)[0], inplace=True) for n in range(3, len(self.data.columns), 12): start = n end = n + 9 dropcols += self.data.columns.tolist()[start:end] print(('Selecting Totals columns for ', dropcols)) df = self.data.drop(columns=dropcols) # #check num cols match and delete end cols in case blank ones have been included if len(['ID'] + ncols) < len(df.columns): df = df[df.columns[0:len(ncols)]] df.reset_index(inplace=True) df.columns = ['ID'] + ncols df.set_index('ID', inplace=True) self.data = df # #sort subjects self.data['SubjectID'] = self.data.index self.sortSubjects('SubjectID')
def __init__(self, *args):
    """Parse Godin leisure-time exercise questionnaire data.

    Cleans subject IDs, drops empty rows, fills remaining gaps with
    999, renames the raw spreadsheet columns to the XNAT field names
    and groups rows by subject.
    """
    DataParser.__init__(self, *args)
    # cleanup subjects: strip whitespace from the ID (column 0)
    self.data['ID'] = self.data.apply(lambda x: stripspaces(x, 0), axis=1)
    if self.info is None:
        self.info = {'prefix': 'GDN', 'xsitype': 'opex:godin'}
    # Replace field headers (single definition - the original assigned
    # the identical list twice)
    self.fields = ['strenuous', 'moderate', 'light', 'total', 'sweat']
    cols = ['ID', 'Strenuous', 'Moderate', 'Light',
            'Totalleisureactivityscore', 'Sweat(1,2,or3)']
    renamecols = dict(list(zip(cols, ['SubjectID'] + self.fields)))
    df = self.data.iloc[:, 0:7]
    # remove rows with fewer than 5 non-null values
    df.dropna(axis=0, how='any', thresh=5, inplace=True)
    df.fillna(999, inplace=True)  # replace any remaining na with 999
    df.rename(columns=renamecols, inplace=True)
    # (dropped a discarded df.reindex() no-op from the original)
    self.data = df
    # sort subjects
    self.sortSubjects('SubjectID')
    print('Data load complete')
def __init__(self, inputdir, fieldsfile='fmri.xlsx', *args):
    """Parse fMRI output.

    Resolves the fields spreadsheet under the resource directory,
    loads the lookup table and data, then groups rows by subject.
    """
    DataParser.__init__(self, *args)
    self.inputdir = inputdir
    self.fieldsfile = os.path.join(self.resource_dir, 'fields', fieldsfile)
    self.lookup()
    self.getdata()
    self.sortSubjects('Subject')
def __init__(self, *args):
    """Parse CANTAB data: derive the additional computed fields and
    group rows by participant.

    Raises:
        Exception: re-raised (with its original traceback) if field
            derivation or subject sorting fails.
    """
    DataParser.__init__(self, *args)
    try:
        self.cantabNewFields()
        self.sortSubjects('Participant ID')
    except Exception as e:
        print(e)
        # bare raise preserves the original traceback; 'raise e' would
        # re-anchor it at this line
        raise
def __init__(self, *args):
    """Parse activity-monitor data and group the rows by subject."""
    DataParser.__init__(self, *args)
    # Replace field headers
    self.fields = ['calories_burned', 'steps', 'distance', 'floors',
                   'min_sed', 'min_lightact', 'min_fairact',
                   'min_veryact', 'act_calories']
    self.sortSubjects('Subject')
    print('Data load complete')
def __init__(self, inputdir, filename, *args):
    """Parse an FCAS questionnaire file.

    Extracts the collection date, builds the q1..q41 field list (plus
    free-text fields q42a-c), scores the responses and groups rows by
    subject.
    """
    DataParser.__init__(self, *args)
    self.inputdir = inputdir
    self.filename = filename
    self.get_date()
    self.fields = ['q%d' % qnum for qnum in range(1, 42)]
    self.fields_comments = ['q42a', 'q42b', 'q42c']
    self.score_fcas()
    self.sortSubjects(subjectfield='Subject')
def __init__(self, *args):
    """Parse ACE-R data: keep ID/TimePoint plus the seven score
    columns, renamed to the XNAT field names, then group by subject."""
    DataParser.__init__(self, *args)
    sourcecols = ["AttentionOrientation", "Memory", "Fluency",
                  "Language", "Visuospatial", "MMSE", "ACERTotal"]
    self.fields = ['attention', 'memory', 'fluency', 'language',
                   'visuospatial', 'MMSE', 'total']
    df = self.data[['ID', 'TimePoint'] + sourcecols]
    df.columns = ['ID', 'interval'] + self.fields
    self.data = df
    self.sortSubjects()
def setUp(self):
    """Create a DataParser over the sample MULTIPLEX blood workbook.

    Any load failure is reported and leaves self.dp as None so the
    individual tests can skip gracefully.
    """
    self.dp = None
    try:
        datafile = join(ROOTDATADIR, 'blood', 'MULTIPLEX',
                        '2018-02-01 1058VB 1021LB 1107 1114.xlsx')
        # sheet=0, skiprows=1, header=None, etype='MULTIPLEX'
        self.dp = DataParser(datafile, 0, 1, None, 'MULTIPLEX')
    except Exception as e:
        print(e)
def __init__(self, inputdir, *args):
    """Parse hippocampal subfield volumes for one interval directory.

    Fields are icv, then every left/right subfield volume, then the
    total hippocampal volume.
    """
    DataParser.__init__(self, *args)
    self.inputdir = inputdir
    self.etype = basename(dirname(self.inputdir))
    # Interval taken from the directory name minus its last character
    # (presumably e.g. '3m' -> '3'; confirm against the data layout)
    self.interval = basename(inputdir)[0:-1]
    subfields = ['CA1', 'CA2', 'DG', 'CA3', 'misc', 'SUB', 'ERC',
                 'BA35', 'BA36', 'PHC', 'sulcus', 'Hippoc']
    sided = [side + '_' + f
             for side in ['left', 'right'] for f in subfields]
    self.fields = ['icv'] + sided + ['Total_Hippoc']
    self.parse()
    self.sortSubjects('Subject')
def __init__(self, *args):
    """Parse data with Current/Past/Total result columns: clean
    subject IDs, rename the columns to the XNAT field names and group
    rows by subject."""
    DataParser.__init__(self, *args)
    # cleanup subjects: strip whitespace from the ID (column 0)
    self.data['ID'] = self.data.apply(lambda x: stripspaces(x, 0), axis=1)
    # Replace field headers
    self.fields = ['current', 'past', 'total']
    mapping = dict(zip(['CurrentResult', 'PastResult', 'TotalResult'],
                       self.fields))
    self.data.rename(columns=mapping, inplace=True)
    self.sortSubjects('ID')
    print('Data load complete')
def __init__(self, *args):
    """Parse accelerometry data.

    Loads the expected field list from resources/fields/accel_fields.csv,
    derives the data type and location from the data file's directory
    layout, and restricts the frame to Subject, interval and the
    accelerometry fields.
    """
    DataParser.__init__(self, *args)
    # Build the path portably - the original hard-coded a Windows
    # backslash path (r'resources\fields'), which breaks on POSIX.
    path = join('resources', 'fields')
    fields = pd.read_csv(join(
        path, 'accel_fields.csv'))['ACCELEROMETRY'].values.tolist()
    # Directory layout: .../<location>/<type>/<datafile>
    self.type = basename(dirname(self.datafile))
    self.location = basename(dirname(dirname(self.datafile)))
    print(self.location)
    if self.type == 'month':
        # monthly summaries carry no per-day columns
        fields = [f for f in fields if f not in ['day', 'valid_day']]
    self.fields = fields
    self.data = self.data[['Subject', 'interval'] + self.fields]
    self.sortSubjects(subjectfield='Subject')
class TestDataparser(unittest.TestCase):
    """Tests for DataParser against a sample MULTIPLEX blood workbook."""

    def setUp(self):
        """Load the sample workbook; leave self.dp as None on failure
        so the tests can skip gracefully."""
        self.dp = None
        try:
            datafile = join(ROOTDATADIR, 'blood', 'MULTIPLEX',
                            '2018-02-01 1058VB 1021LB 1107 1114.xlsx')
            # sheet=0, skiprows=1, header=None, etype='MULTIPLEX'
            self.dp = DataParser(datafile, 0, 1, None, 'MULTIPLEX')
        except Exception as e:
            print(e)

    def tearDown(self):
        """Close the database connection if a parser was created."""
        if self.dp is not None:
            self.dp.dbi.closeconn()

    def test_info(self):
        """Parser info matches the expected MULTIPLEX metadata."""
        if self.dp is not None:
            expected = {'prefix': u'MPX',
                        'xsitype': u'opex:bloodMultiplexData'}
            self.assertDictEqual(expected, self.dp.info)

    def test_fields(self):
        """Parser field list matches the MULTIPLEX assay fields."""
        if self.dp is not None:
            expected = [u'GH', u'Leptin', u'BDNF', u'IGFBP7',
                        u'IL1', u'IL4', u'IL6', u'IL10']
            self.assertListEqual(expected, self.dp.fields)

    def test_prefix(self):
        """getPrefix() returns the info prefix."""
        if self.dp is not None and self.dp.info is not None:
            self.assertEqual('MPX', self.dp.getPrefix())

    def test_xsd(self):
        """getxsd() returns the info xsitype."""
        if self.dp is not None and self.dp.info is not None:
            self.assertEqual('opex:bloodMultiplexData', self.dp.getxsd())
def __init__(self, *args, **kwargs):
    """Parse Insomnia questionnaire data.

    Validates that the sheet is non-empty, cleans subject IDs, blanks
    out zero-filled no-data rows, and renames Q1-Q7/TotalScore to the
    XNAT field names.

    Raises:
        ValueError: if the sheet is empty or has a single column.
    """
    DataParser.__init__(self, *args)  # NOTE: kwargs intentionally not forwarded
    # Maybe empty sheet
    if self.data.empty or len(self.data.columns) <= 1:
        msg = "No data available"
        raise ValueError(msg)
    # cleanup subjects: strip whitespace from the ID (column 0)
    self.data['ID'] = self.data.apply(lambda x: stripspaces(x, 0), axis=1)
    if self.info is None:
        self.info = {'prefix': 'INS', 'xsitype': 'opex:insomnia'}
    # Replace field headers: XNAT field name -> source column name
    self.fieldmap = {'q1': 'Q1', 'q2': 'Q2', 'q3': 'Q3', 'q4': 'Q4',
                     'q5': 'Q5', 'q6': 'Q6', 'q7': 'Q7',
                     'total': 'TotalScore'}
    cols = ['ID'] + [self.fieldmap[f]
                     for f in ['q1', 'q2', 'q3', 'q4',
                               'q5', 'q6', 'q7', 'total']]
    self.fields = list(self.fieldmap.keys())
    ncols = ['SubjectID'] + self.fields
    # zeros have been entered when should be blank - blank them out
    self.data[self.fieldmap['total']] = self.data.apply(
        lambda x: self.nodatarow(x, self.fieldmap['total']), axis=1)
    self.data[cols[1:]] = self.data.apply(
        lambda x: self.nodatarow(x, cols[1:]), axis=1)
    df = self.data[cols].astype(object)  # convert to object
    df.columns = ncols
    # (dropped a discarded df.reindex() no-op from the original)
    self.data = df
    # sort subjects
    self.sortSubjects('SubjectID')
    print('Data load complete')
def __init__(self, *args):
    """Parse volume output where subject and session are encoded in
    'subjname' as sub-<ID>_ses-<interval>."""
    DataParser.__init__(self, *args)
    df = self.data
    # Extract subject ID and numeric interval from the BIDS-style name
    df['Subject'] = df['subjname'].apply(
        lambda s: re.findall('(?<=sub-)(.*)(?=_)', s)[0])
    df['interval'] = df['subjname'].apply(
        lambda s: int(re.findall('(?<=ses-)(.*)', s)[0]))
    # Align raw column names with the expected field names
    df.rename(columns={'ICV': 'icv',
                       'right_subiculum': 'right_SUB',
                       'left_subiculum': 'left_SUB'}, inplace=True)
    # Keep only the expected fields actually present in this file
    self.cols = [c for c in self.fields if c in df.columns]
    self.data = df[['Subject', 'interval'] + self.cols]
    self.sortSubjects('Subject')
def __init__(self, *args):
    """Parse SF-36 data: split 'RecordID' into Subject and interval,
    normalise the interval label, and group rows by subject."""
    DataParser.__init__(self, *args)
    self.fields = ['PF', 'RP', 'BP', 'GH', 'VT',
                   'SF', 'RE', 'MH', 'PCS', 'MCS']
    df = self.data
    # RecordID is '<subject> <interval...>' - split on first whitespace
    df[['Subject', 'interval']] = df.pop('RecordID').str.split('\\s', 1, expand=True)
    df['interval'] = df['interval'].apply(extract_interval)
    self.data = df
    self.sortSubjects('Subject')
def setUp(self):
    """Create a DataParser over the sample CANTAB export; leave
    self.dp as None if loading fails so tests can skip."""
    self.dp = None
    try:
        datafile = join(ROOTDATADIR, 'cantab',
                        'RowBySession_HealthyBrains_20180504.csv')
        # sheet=0, skiprows=0, header=None, etype='CANTAB'
        self.dp = DataParser(datafile, 0, 0, None, 'CANTAB')
    except Exception as e:
        print(e)
def __init__(self, *args):
    """Parse PACES questionnaire data: clean subject IDs and rename
    Q1-Q8/SumTotal/%Enjoyment to the XNAT field names."""
    DataParser.__init__(self, *args)
    # cleanup subjects: strip whitespace from the ID (column 0)
    self.data['ID'] = self.data.apply(lambda x: stripspaces(x, 0), axis=1)
    if self.info is None:
        # NOTE(review): prefix 'GDN' matches the Godin parser - confirm
        # it is intentional for opex:paces
        self.info = {'prefix': 'GDN', 'xsitype': 'opex:paces'}
    # Replace field headers
    self.fields = ['q%d' % qnum for qnum in range(1, 9)] + ['total', 'enjoy_percent']
    sourcecols = ['Q1', 'Q2', 'Q3', 'Q4', 'Q5', 'Q6', 'Q7', 'Q8',
                  'SumTotal', '%Enjoyment']
    self.data.rename(columns=dict(zip(sourcecols, self.fields)),
                     inplace=True)
    self.sortSubjects('ID')
    print('Data load complete')
def __init__(self, *args):
    """Parse PSQI questionnaire data.

    Keeps the ID, the seven 'Component*' score columns and the total,
    renamed to SubjectID/c1..c7/total, then groups rows by subject.

    Raises:
        ValueError: if the sheet is empty or has a single column.
    """
    DataParser.__init__(self, *args)
    # Maybe empty sheet
    if self.data.empty or len(self.data.columns) <= 1:
        msg = "No data available"
        raise ValueError(msg)
    # cleanup subjects: strip whitespace from the ID (column 0)
    self.data['ID'] = self.data.apply(lambda x: stripspaces(x, 0), axis=1)
    if self.info is None:
        self.info = {'prefix': 'PSQ', 'xsitype': 'opex:psqi'}
    # Replace field headers
    self.fields = ['c' + str(i) for i in range(1, 8)] + ['total']
    ncols = ['SubjectID'] + self.fields
    # Single isinstance check - the original tested str twice, a
    # leftover from a Python 2 str/unicode check.
    cols = ['ID'] + [c for c in self.data.columns
                     if isinstance(c, str)
                     and c.startswith('Component')] + ['total']
    df = self.data[cols]
    df.columns = ncols
    # (dropped a discarded df.reindex() no-op from the original)
    self.data = df
    # sort subjects
    self.sortSubjects('SubjectID')
    print('Data load complete')
def __init__(self, inputdir, inputsubdir, datafile, testonly=False):
    """Parse COSMED exercise-physiology data.

    Reads the field-definition workbook, collects the per-subject xlsx
    files under inputsubdir, loads the shared efficiency data from
    datafile, then parses the subject files.

    Args:
        inputdir: top-level COSMED data directory.
        inputsubdir: directory holding the per-subject xlsx files.
        datafile: workbook containing the efficiency data.
        testonly: flag stored on the instance (presumably suppresses
            upload downstream - confirm against callers).
    """
    DataParser.__init__(self, etype='COSMED')
    self.inputdir = inputdir
    self.testonly = testonly
    # Load field definitions
    fieldsfile = join(self.getResourceDir(), "cosmed_fields.xlsx")
    self.subjectdataloc = pd.read_excel(fieldsfile, header=0, sheet_name='cosmed')
    self.fields = pd.read_excel(fieldsfile, header=0, sheet_name='cosmed_xnat')
    self.datafields = pd.read_excel(fieldsfile, header=0, sheet_name='cosmed_data')
    # Individual subject files to parse
    self.subjects = dict()
    self.files = glob.glob(join(inputsubdir, "*.xlsx"))
    # Ensure an output dir exists for processed files
    pdir = join(inputsubdir, 'processed')
    if not isdir(pdir):
        mkdir(pdir)
    # Column ranges of the efficiency sheet, keyed by interval (months)
    self.effdata_cols = {'0': [9, 12], '3': [13, 16], '6': [17, 20],
                         '9': [21, 24], '12': [25, 28]}
    self.effdata = self.__loadEfficiencydata(datafile)
    # Load data from files
    self.loaded = self.__loadData()
def __init__(self, *args):
    """Parse a training-diary workbook; the intensity type (AIT, MIT
    or LIT) is encoded in the filename as 'Diary-XXX'."""
    DataParser.__init__(self, *args)
    self.training_file = pd.ExcelFile(self.datafile)
    self.type = re.findall('(?<=Diary-)[A-Z]{3}', basename(self.datafile))[0]
    # Dispatch table: intensity type -> extraction routine
    self.extraction = {'AIT': extract_AIT,
                       'MIT': extract_MIT,
                       'LIT': extract_LIT}
    self.getData()
def __init__(self, inputdir, *args):
    """Load data from inputdir and group the rows by subject."""
    DataParser.__init__(self, *args)
    self.inputdir = inputdir
    self.getData()
    self.sortSubjects('Subject')
def __init__(self, *args):
    """Load the data then group rows by subject."""
    DataParser.__init__(self, *args)
    self.get_data()
    self.sortSubjects('Subject')
def __init__(self, *args, **kwargs):
    # Blood-assay parser: maps each assay type's raw spreadsheet
    # headers onto the XNAT field names, then groups rows by subject.
    DataParser.__init__(self, *args)
    if self.data is None:
        raise ValueError('BloodParser: Data not loaded')
    self.type = ''
    if 'type' in kwargs:
        # Explicit type: fields/info come from the database interface
        self.type = kwargs.get('type')
        self.fields = self.dbi.getFields(self.type)
        self.info = self.dbi.getInfo(self.type)
        # self.fields = self.getFieldsFromFile(self.type)
    elif self.etype is not None:
        self.type = self.etype
    print('Rename Headers for ', self.type)
    ## Rename columns in dataframe (per assay type)
    if self.type == 'IGF':
        colnames = {'Date': 'A_Date', 'Participant ID ': 'Participant ID',
                    'Timepoint': 'Sample ID', 'IGF-1': 'IGF1'}
        self.data = self.data.rename(index=str, columns=colnames)
    elif self.type == 'SOMATO':
        colnames = {'Date': 'A_Date', 'Participant ID ': 'Participant ID',
                    'Timepoint': 'Sample ID', 'Somatostatin': 'somatostatin'}
        self.data = self.data.rename(index=str, columns=colnames)
    elif self.type == 'BDNF':
        colnames = {'Date': 'A_Date', 'Participant ID ': 'Participant ID',
                    'Timepoint': 'Sample ID'}
        self.data = self.data.rename(index=str, columns=colnames)
    elif self.type == 'MULTIPLEX':
        colnames = {'Date': 'A_Date', 'Participant ID ': 'Participant ID',
                    'Timepoint': 'Sample ID', 'IGFBP-7': 'IGFBP7'}
        self.data = self.data.rename(index=str, columns=colnames)
    elif self.type == 'INFLAM':
        print('Headers for ', self.type)
        # Greek-letter cytokine headers mapped to ASCII field names
        colnames = {'Date': 'A_Date', 'Participant ID ': 'Participant ID',
                    'Timepoint': 'Sample ID',
                    u'IFN\u03b3': 'ifngamma',
                    'IL-10': 'il10',
                    'IL-12(p70)': 'il12p70',
                    u'IL-1\u03b2': 'il1beta',
                    'IL-6': 'il6',
                    'IL-8': 'il8cxcl8',
                    u'TNF\u03B1': 'tnfalpha'
                    }
        self.data = self.data.rename(index=str, columns=colnames)
    elif self.type == 'ELISAS':
        colnames = {'Date': 'A_Date', 'Participant ID ': 'Participant ID',
                    'Timepoint': 'Sample ID', 'Beta-H (ng/ul)': 'BetaHydroxy'}
        self.data = self.data.rename(index=str, columns=colnames)
    elif self.type == 'COBAS':
        # Name unnamed columns to field names
        if self.fields[0] not in self.data.columns:
            # Raw export: values appear in alternating 'Value.N'
            # columns (1, 3, 5, ...) - presumably value/flag pairs;
            # confirm against a raw COBAS sheet.
            colnames = {}
            v = 1
            for i in range(len(self.fields)):
                colnames['Value.' + str(v)] = self.fields[i]
                v = v + 2
        else:
            colnames = {'Date': 'A_Date', 'Participant ID ': 'Participant ID',
                        'Timepoint': 'Sample ID', 'Prolactin': 'Prolactin',
                        'Insulin': 'Insulin', 'HGH': 'HGH',
                        'Cortisol': 'Cortisol'}
        self.data = self.data.rename(index=str, columns=colnames)
    print('Colnames: ', self.data.columns.tolist())
    # Insert Row Number column
    if 'R_No.' not in self.data.columns:
        self.data.insert(0, 'R_No.', list(range(len(self.data))))
    # Remove NaT rows (missing dates serialized as the string "NaT")
    i = self.data.query('A_Date =="NaT"')
    if not i.empty:
        self.data.drop(i.index[0], inplace=True)
        print('NaT row dropped')
    # Organize data into subjects
    subjectfield = 'Participant ID'
    if subjectfield not in self.data.columns:
        raise ValueError('Subject ID field not present: ', subjectfield)
    self.data[subjectfield] = self.data[subjectfield].str.replace(" ", "")
    self.sortSubjects(subjectfield)
    if self.subjects is not None:
        print('BloodParser: subjects loaded successfully')
    self.subjectfield = subjectfield
def __init__(self, *args):
    """Initialise the parser with an empty experiments lookup."""
    DataParser.__init__(self, *args)
    # Experiments keyed lazily as they are encountered
    self.expts = dict()
def __init__(self, **kwargs):
    """Load the missing-data report for this datafile and group the
    rows by subject."""
    DataParser.__init__(self, **kwargs)
    self.data = missingData(self.datafile)
    self.sortSubjects('Subject')