def harvest(self, *filenames):
    """
    Extract the variable data from the provided files and append it to
    ``self.data``.

    Does nothing when ``self.is_harvested`` is already set. Otherwise each
    file is read with ``self.harvest_single_file``. A ``pd.Series`` result is
    mapped through ``self.transform`` before it is concatenated onto
    ``self.data``; any other result is concatenated unchanged. Afterwards
    ``self.declare_harvested()`` is called.

    Args:
        *filenames (str): the files to extract from; the extension of each
            file has to be contained in REGISTERED_FILEEXTENSIONS

    Returns:
        None

    Raises:
        AssertionError: if a file has an unregistered extension or does not
            exist
    """
    # The docstring above is deliberately a plain literal: a string with
    # ``.format(...)`` appended is an ordinary expression, not a docstring.
    if self.is_harvested:
        return None

    for filename in filenames:
        ext = f.strip_all_endings(filename)[1]
        # Raised explicitly so the checks also run under ``python -O``;
        # the exception type stays AssertionError for existing callers.
        if ext not in REGISTERED_FILEEXTENSIONS:
            raise AssertionError("Filetype {} not known!".format(ext))
        if not os.path.exists(filename):
            raise AssertionError("File {} does not exist!".format(filename))

        data = self.harvest_single_file(filename, ext)
        if isinstance(data, pd.Series):
            self.data = pd.concat([self.data, data.map(self.transform)])
        else:
            self.data = pd.concat([self.data, data])
        # Drop the per-file result before the next file is read.
        del data

    self.declare_harvested()
    return None
def harvest(filenames, definitions, **kwargs):
    """
    Extract the variable data from the provided files

    For ``.h5`` files the definitions are tried in order and the first one
    whose node exists in the file is used. For ``.root`` files every
    definition is read and the result of the last one is kept.

    Args:
        filenames (list): the files to extract from; the extension of each
            file has to be contained in REGISTERED_FILEEXTENSIONS
        definitions (list): candidate locations of the variable. For ``.h5``
            files ``definition[0]`` is used as node name and
            ``definition[1]`` as column name; for ``.root`` files the
            definition is unpacked into the arguments of ``rn.root2rec``

    Keyword Args:
        transformation (func): will be applied to the read out data

    Returns:
        pd.Series: the data of all files concatenated in the given order;
            an empty float64 series if nothing was read

    Raises:
        AssertionError: if a file has an unregistered extension or does not
            exist
    """
    # The docstring above is deliberately a plain literal: a string with
    # ``.format(...)`` appended is an ordinary expression, not a docstring.
    has_transform = "transformation" in kwargs

    # Per-file results are collected and concatenated once at the end;
    # re-concatenating inside the loop copies the accumulated data each time.
    chunks = []
    for filename in filenames:
        filetype = f.strip_all_endings(filename)[1]
        # Raised explicitly so the checks also run under ``python -O``;
        # the exception type stays AssertionError for existing callers.
        if filetype not in REGISTERED_FILEEXTENSIONS:
            raise AssertionError("Filetype {} not known!".format(filetype))
        if not os.path.exists(filename):
            raise AssertionError("File {} does not exist!".format(filename))
        Logger.debug("Attempting to harvest {1} file {0}".format(filename, filetype))

        is_hdf = filetype == ".h5"
        # NOTE(review): openFile/getNode are the PyTables 2.x camelCase
        # names (open_file/get_node in later releases) - confirm against
        # the installed PyTables version.
        if is_hdf and not isinstance(filename, tables.table.Table):
            hdftable = tables.openFile(filename)
        else:
            hdftable = filename

        # Explicit dtype: matches the float64 used for HDF columns below and
        # avoids the version-dependent default dtype of an empty Series.
        tmpdata = pd.Series(dtype="float64")
        try:
            for definition in definitions:
                if is_hdf:
                    try:
                        column = hdftable.getNode("/" + definition[0]).col(definition[1])
                    except tables.NoSuchNodeError:
                        Logger.debug("Can not find definition {0} in {1}! ".format(definition, filename))
                        continue
                    tmpdata = pd.Series(column, dtype=n.float64)
                    Logger.debug("Found {} entries in table for {}{}".format(len(tmpdata), definition[0], definition[1]))
                    break
                elif filetype == ".root":
                    # Wrap the freshly read record array, not the
                    # accumulated output of the previous files.
                    tmpdata = pd.Series(rn.root2rec(filename, *definition))
        finally:
            # Close the HDF handle even if reading failed unexpectedly.
            if is_hdf:
                hdftable.close()

        if has_transform:
            tmpdata = tmpdata.map(kwargs["transformation"])
        chunks.append(tmpdata)

    if not chunks:
        # pd.concat refuses an empty list
        return pd.Series(dtype="float64")
    return pd.concat(chunks)