def getPairDiff(self, filepath, querypath, numberrange, dayrange, firstcol, lastcol, excel):
    """Load a data table, filter it, and return per-column pair differences.

    The pipeline is: import -> handle_range -> query filter -> handle_days
    -> pairdiff -> summaries. Each intermediate table is stored on ``self``
    (``data``, ``datarange``, ``dataquery``, ``datadays``, ``datamat``,
    ``pairdiff``) and ``self.debugger(filepath)`` is called before returning.

    Args:
        filepath: Path to a ``.csv``, ``.xls`` or ``.xlsx`` file. The first
            column is used as the index.
        querypath: Path handed to ``createQuery`` (the XML row query).
        numberrange: Option forwarded to ``handle_range``.
        dayrange: Option forwarded to ``handle_days``. Per the original
            notes: 0 is average, 1 is first, 2 is second, -1 keeps all rows.
        firstcol: Position of the first measurement column (inclusive).
        lastcol: Position of the last measurement column (inclusive).
        excel: Sheet selector passed to ``pd.read_excel`` (0-based when an
            integer); ignored for csv input.

    Returns:
        A DataFrame indexed by the selected column names with the columns
        ``diff``, ``mean``, ``sd``, ``count_nan`` and ``sample_size``.

    Raises:
        ValueError: If the file extension is not csv, xls or xlsx.
    """
    # Determine the type of import from the real extension. splitext copes
    # with names containing several dots or none at all, which a plain
    # split(".")[1] does not.
    extension = os.path.splitext(filepath)[1].lower()
    if extension == ".csv":
        # Same defaults the removed DataFrame.from_csv used.
        data = pd.read_csv(filepath, index_col=0, parse_dates=True)
    elif extension in (".xlsx", ".xls"):
        # The sheet is passed positionally because the keyword was renamed
        # from ``sheetname`` to ``sheet_name`` between pandas releases.
        data = pd.read_excel(filepath, excel, index_col=0)
    else:
        raise ValueError(
            "Unsupported file type %r for %r; expected .csv, .xls or .xlsx"
            % (extension, filepath))
    self.data = copy(data)  # original data

    # Process range entries, i.e. "[number] to [number]". The return value
    # is not used, so handle_range presumably edits ``data`` in place.
    handle_range(data, numberrange, firstcol, lastcol)
    self.datarange = copy(data)  # save the range dataset

    # Subset selection from the query file: take data, give datadrop.
    inclusion = createQuery(querypath, data.shape[0], data)
    # ``inclusion`` is consumed by position (one flag per row), so select
    # with .iloc; .ix would switch to label lookup on an integer index.
    keep_rows = [pos for pos, keep in enumerate(inclusion) if keep]
    datadrop = data.iloc[keep_rows]
    self.dataquery = datadrop  # save the query inclusion

    # Collapse data with multiple days: take datadrop, give dataproc.
    dataproc = handle_days(datadrop, dayrange, firstcol, lastcol)
    self.datadays = dataproc

    # Create the differences from the measurement columns only.
    datamat = dataproc.iloc[:, range(firstcol, lastcol + 1)]
    self.datamat = datamat
    diff = pairdiff(datamat)
    diffmeas = pd.DataFrame(data=diff,
                            index=dataproc.columns[firstcol:lastcol + 1],
                            columns=["diff"])

    # Add summary information to the output.
    outmean, outsd, outnan, outsamplesize = summaries(dataproc, firstcol, lastcol)
    diffmeas['mean'] = pd.Series(outmean, index=diffmeas.index)
    diffmeas['sd'] = pd.Series(outsd, index=diffmeas.index)
    diffmeas['count_nan'] = pd.Series(outnan, index=diffmeas.index)
    diffmeas['sample_size'] = pd.Series(outsamplesize, index=diffmeas.index)

    self.pairdiff = diffmeas
    self.debugger(filepath)
    return diffmeas
def getPairDiff(self, filepath, querypath, numberrange, dayrange, firstcol,
                lastcol, excel):
    """Import a table, apply the row query, and compute pair differences.

    Steps, in order: read the file, run ``handle_range``, filter rows with
    ``createQuery``, collapse days with ``handle_days``, run ``pairdiff`` on
    the measurement columns, then attach the ``summaries`` output. The
    intermediate results are kept on ``self`` as ``data``, ``datarange``,
    ``dataquery``, ``datadays``, ``datamat`` and ``pairdiff``; finally
    ``self.debugger(filepath)`` is invoked.

    Args:
        filepath: Input file; must end in ``.csv``, ``.xls`` or ``.xlsx``.
            Its first column becomes the index.
        querypath: Query file path given to ``createQuery``.
        numberrange: Setting passed through to ``handle_range``.
        dayrange: Setting passed through to ``handle_days`` (original
            notes: 0 is average, 1 is first, 2 is second, -1 keeps all
            rows).
        firstcol: First measurement column position, inclusive.
        lastcol: Last measurement column position, inclusive.
        excel: Sheet to read for Excel input (0-based when an integer);
            unused for csv input.

    Returns:
        DataFrame with one row per selected column and the columns
        ``diff``, ``mean``, ``sd``, ``count_nan``, ``sample_size``.

    Raises:
        ValueError: If ``filepath`` has an unsupported or missing extension.
    """
    # Use the true extension: split(".")[1] picked the wrong token for
    # names with extra dots and raised IndexError when there was none.
    suffix = os.path.splitext(filepath)[1].lower()
    if suffix == ".csv":
        # Equivalent of the removed DataFrame.from_csv defaults.
        data = pd.read_csv(filepath, index_col=0, parse_dates=True)
    elif suffix in (".xlsx", ".xls"):
        # Positional sheet argument works whether the installed pandas
        # names the keyword ``sheetname`` or ``sheet_name``.
        data = pd.read_excel(filepath, excel, index_col=0)
    else:
        # Fail here instead of hitting an unbound ``data`` further down.
        raise ValueError(
            "Unsupported file type %r for %r; expected .csv, .xls or .xlsx"
            % (suffix, filepath))
    self.data = copy(data)  # original data

    # Range entries ("[number] to [number]"); the result is not captured,
    # so handle_range presumably modifies ``data`` in place.
    handle_range(data, numberrange, firstcol, lastcol)
    self.datarange = copy(data)  # save the range dataset

    # Row subset from the query file: take data, give datadrop.
    inclusion = createQuery(querypath, data.shape[0], data)
    # Flags are matched to rows by position, hence .iloc rather than .ix
    # (which becomes label-based when the index holds integers).
    selected = [row for row, wanted in enumerate(inclusion) if wanted]
    datadrop = data.iloc[selected]
    self.dataquery = datadrop  # save the query inclusion

    # Multiple-day handling: take datadrop, give dataproc.
    dataproc = handle_days(datadrop, dayrange, firstcol, lastcol)
    self.datadays = dataproc

    # Differences are computed over the measurement columns only.
    datamat = dataproc.iloc[:, range(firstcol, lastcol + 1)]
    self.datamat = datamat
    diffmeas = pd.DataFrame(data=pairdiff(datamat),
                            index=dataproc.columns[firstcol:lastcol + 1],
                            columns=["diff"])

    # Attach the summary statistics, aligned on the result's index.
    outmean, outsd, outnan, outsamplesize = summaries(
        dataproc, firstcol, lastcol)
    summary_columns = (
        ('mean', outmean),
        ('sd', outsd),
        ('count_nan', outnan),
        ('sample_size', outsamplesize),
    )
    for label, values in summary_columns:
        diffmeas[label] = pd.Series(values, index=diffmeas.index)

    self.pairdiff = diffmeas
    self.debugger(filepath)
    return diffmeas
print "No filters specified in xml query." return(query.include) if __name__ == "__main__": ######################## #remove this later lastcol = 10 firstcol = 1 import pandas as pd from process import handle_range, handle_days from pairdiff import pairdiff data = pd.DataFrame.from_csv("../DmelClockTimeSeriesSearch-2015-03-26--DataTable3.csv") handle_range(data,1) #process range issue dataproc = handle_days(data,0) #processed data with dates m, n = dataproc.shape ######################## #xml processing xmldoc = minidom.parse('query.xml') #only user input is here rawexclusion = xmldoc.getElementsByTagName('rowMod') rawType = str(rawexclusion[0].attributes['type'].value) rawValue = [int(x) for x in (rawexclusion[0].firstChild.nodeValue).split(',')] filters = xmldoc.getElementsByTagName('filter') #build query object query = filterobj(m,dataproc) if rawType == 'include':