def population_project(self, year_length=None, method=None, growth_rate=None):
    """
    Continuation of population to provide convergent present values

    The population of the last available year is repeated for every
    missing year up to ``first_year + year_length`` and merged into the
    cohort, which is then re-initialised in place with the combined data.
    Nothing happens when the existing years already cover that horizon.

    Parameters
    ----------
    year_length : int, default None
        Duration to continue the population projection
    method : str, default None
        The value must be 'stable' or 'exp_growth'
    growth_rate : default None
        Passed to ``gen_grth``; required for the 'exp_growth' method and
        unused for 'stable'

    Raises
    ------
    ValueError
        If 'pop' is not a column, if ``year_length`` or ``method`` is
        missing, if ``method`` is not one of the two supported values, or
        if 'exp_growth' is requested without a ``growth_rate``.
    """
    if 'pop' not in self.columns:
        raise ValueError('pop is not a column of cohort')
    if year_length is None:
        raise ValueError('a duration in years should be provided')
    if method is None:
        raise ValueError('a method should be specified')

    years = self.index_sets['year']
    first_year = min(years)
    last_year = max(years)
    new_last_year = first_year + year_length
    if new_last_year <= last_year:
        # The existing data already reaches the requested horizon.
        return

    # An unknown method used to fall through both branches silently.
    if method not in ('stable', 'exp_growth'):
        raise ValueError("method must be 'stable' or 'exp_growth'")
    if method == 'exp_growth' and growth_rate is None:
        raise ValueError('a growth rate must be provided for the method')

    # Both methods start by repeating the last year for each new year.
    last_pop = self.xs(last_year, level='year', axis=0)
    new_years = range(last_year + 1, new_last_year + 1)
    pop = concat([last_pop] * len(new_years), keys=new_years, names=['year'])
    pop = pop.reorder_levels(['age', 'sex', 'year'], axis=0)

    if method == 'exp_growth':
        # Scale the repeated population by the generated growth column.
        pop = Cohorts(pop)
        pop.gen_grth(growth_rate)
        pop['pop'] *= pop['grth']
        del pop['grth']

    combined = self.combine_first(pop)
    self.__init__(data=combined, columns=['pop'])
def test_reorder_levels(self): index = MultiIndex(levels=[['bar'], ['one', 'two', 'three'], [0, 1]], labels=[[0, 0, 0, 0, 0, 0], [0, 1, 2, 0, 1, 2], [0, 1, 0, 1, 0, 1]], names=['L0', 'L1', 'L2']) df = DataFrame({'A': np.arange(6), 'B': np.arange(6)}, index=index) # no change, position result = df.reorder_levels([0, 1, 2]) assert_frame_equal(df, result) # no change, labels result = df.reorder_levels(['L0', 'L1', 'L2']) assert_frame_equal(df, result) # rotate, position result = df.reorder_levels([1, 2, 0]) e_idx = MultiIndex(levels=[['one', 'two', 'three'], [0, 1], ['bar']], labels=[[0, 1, 2, 0, 1, 2], [0, 1, 0, 1, 0, 1], [0, 0, 0, 0, 0, 0]], names=['L1', 'L2', 'L0']) expected = DataFrame({'A': np.arange(6), 'B': np.arange(6)}, index=e_idx) assert_frame_equal(result, expected)
def test_reorder_levels(self): index = MultiIndex(levels=[['bar'], ['one', 'two', 'three'], [0, 1]], labels=[[0, 0, 0, 0, 0, 0], [0, 1, 2, 0, 1, 2], [0, 1, 0, 1, 0, 1]], names=['L0', 'L1', 'L2']) df = DataFrame({'A': np.arange(6), 'B': np.arange(6)}, index=index) # no change, position result = df.reorder_levels([0, 1, 2]) assert_frame_equal(df, result) # no change, labels result = df.reorder_levels(['L0', 'L1', 'L2']) assert_frame_equal(df, result) # rotate, position result = df.reorder_levels([1, 2, 0]) e_idx = MultiIndex(levels=[['one', 'two', 'three'], [0, 1], ['bar']], labels=[[0, 1, 2, 0, 1, 2], [0, 1, 0, 1, 0, 1], [0, 0, 0, 0, 0, 0]], names=['L1', 'L2', 'L0']) expected = DataFrame({ 'A': np.arange(6), 'B': np.arange(6) }, index=e_idx) assert_frame_equal(result, expected)
def population_project(self, year_length=None, method=None, growth_rate=None):
    """
    Continuation of population to provide convergent present values

    The population of the last available year is repeated for every
    missing year up to ``first_year + year_length`` and merged into the
    cohort, which is then re-initialised in place with the combined data.
    Nothing happens when the existing years already cover that horizon.

    Parameters
    ----------
    year_length : int, default None
        Duration to continue the population projection
    method : str, default None
        The value must be 'stable' or 'exp_growth'
    growth_rate : default None
        Passed to ``gen_grth``; required for the 'exp_growth' method and
        unused for 'stable'

    Raises
    ------
    ValueError
        If 'pop' is not a column, if ``year_length`` or ``method`` is
        missing, if ``method`` is not one of the two supported values, or
        if 'exp_growth' is requested without a ``growth_rate``.
    """
    if 'pop' not in self.columns:
        raise ValueError('pop is not a column of cohort')
    if year_length is None:
        raise ValueError('a duration in years should be provided')
    if method is None:
        raise ValueError('a method should be specified')

    years = self.index_sets['year']
    first_year = min(years)
    last_year = max(years)
    new_last_year = first_year + year_length
    if new_last_year <= last_year:
        # The existing data already reaches the requested horizon.
        return

    # An unknown method used to fall through both branches silently.
    if method not in ('stable', 'exp_growth'):
        raise ValueError("method must be 'stable' or 'exp_growth'")
    if method == 'exp_growth' and growth_rate is None:
        raise ValueError('a growth rate must be provided for the method')

    # Both methods start by repeating the last year for each new year.
    last_pop = self.xs(last_year, level='year', axis=0)
    new_years = range(last_year + 1, new_last_year + 1)
    pop = concat([last_pop] * len(new_years), keys=new_years, names=['year'])
    pop = pop.reorder_levels(['age', 'sex', 'year'], axis=0)

    if method == 'exp_growth':
        # Scale the repeated population by the generated growth column.
        pop = Cohorts(pop)
        pop.gen_grth(growth_rate)
        pop['pop'] *= pop['grth']
        del pop['grth']

    combined = self.combine_first(pop)
    self.__init__(data=combined, columns=['pop'])
def read_data_file(fn, skiplines=1, maxlines=False):
    """ A function to read any foam data files returning data and index after header

    Parameters
    ----------
    fn : str
        Path of the data file; the part after the last '/' names the
        column of single-column files.
    skiplines : int
        Step used when slicing the data lines (1 keeps every line).
    maxlines
        Accepted for interface compatibility; not used by this function.

    Returns
    -------
    ``(names, df, hashes)`` for files with several entries per line,
    ``(field, df, hashes)`` for files with one entry per line, or ``None``
    when the file does not exist or anything fails while processing it.
    """
    # TODO check if sorting the index gives any performance benefits
    if not os.path.exists(fn):
        print("Can not open file " + fn)
        return None
    try:
        # Only the read needs the file handle; parsing happens afterwards.
        with open(fn, encoding="utf-8") as f:
            content = f.readlines()
        field = fn.split('/')[-1]
        # NOTE(review): the extra trailing entry looks like a sentinel for
        # if_header_skip -- confirm against that helper.
        content.append('bla')
        start, num_entries = if_header_skip(content)
        entries = len(content[start].split())
        end = start + num_entries
        rows = content[start:end:skiplines]

        if entries > 1:
            # Raw string: same pattern as before, without invalid escapes.
            strip_parens = re.compile(r"[0-9]*\(|\)")
            data = [strip_parens.sub('', row).split() for row in rows]
            loc, names = evaluate_names(fn, entries)
            df = DataFrame(data=data, columns=names)
            df['Loc'] = loc if loc else range(len(df))
            df.set_index('Loc', append=True, inplace=True)
            df.index.names = ['Id', 'Loc']
            df = df.reorder_levels(['Loc', 'Id'])
            df = df.astype(float)
            hashes = {col: hash_series(df[col]) for col in df.columns}
            return names, df, hashes

        data = [np.float32(x) for x in rows]
        df = DataFrame(data=data, columns=[field])
        df['Loc'] = "Field"
        df.set_index('Loc', append=True, inplace=True)
        df.index.names = ['Id', 'Loc']
        df = df.reorder_levels(['Loc', 'Id'])
        hashes = {
            field: int(hashlib.md5(str(data).encode('utf-8')).hexdigest(), 16)
        }
        return field, df, hashes
    except Exception as e:
        # Best effort: an unreadable file is reported (in DEBUG mode) and
        # skipped by the caller rather than aborting the whole import.
        if DEBUG:
            print("Error processing datafile " + fn)
            print(e)
        return None
def read_data_file(fn, skiplines=1, maxlines=False):
    """ A function to read any foam data files returning data and index after header

    Parameters
    ----------
    fn : str
        Path of the data file; the part after the last '/' names the
        column of single-column files.
    skiplines : int
        Step used when slicing the data lines (1 keeps every line).
    maxlines
        Accepted for interface compatibility; not used by this function.

    Returns
    -------
    ``(names, df, hashes)`` for files with several entries per line,
    ``(field, df, hashes)`` for files with one entry per line, or ``None``
    when the file does not exist or anything fails while processing it.
    """
    # TODO check if sorting the index gives any performance benefits
    if not os.path.exists(fn):
        print("Can not open file " + fn)
        return None
    try:
        # Only the read needs the file handle; parsing happens afterwards.
        with open(fn, encoding="utf-8") as f:
            content = f.readlines()
        field = fn.split('/')[-1]
        # NOTE(review): the extra trailing entry looks like a sentinel for
        # if_header_skip -- confirm against that helper.
        content.append('bla')
        start, num_entries = if_header_skip(content)
        entries = len(content[start].split())
        end = start + num_entries
        rows = content[start:end:skiplines]

        if entries > 1:
            # Raw string: same pattern as before, without invalid escapes.
            strip_parens = re.compile(r"[0-9]*\(|\)")
            data = [strip_parens.sub('', row).split() for row in rows]
            loc, names = evaluate_names(fn, entries)
            df = DataFrame(data=data, columns=names)
            df['Loc'] = loc if loc else range(len(df))
            df.set_index('Loc', append=True, inplace=True)
            df.index.names = ['Id', 'Loc']
            df = df.reorder_levels(['Loc', 'Id'])
            df = df.astype(float)
            hashes = {col: hash_series(df[col]) for col in df.columns}
            return names, df, hashes

        data = [np.float32(x) for x in rows]
        df = DataFrame(data=data, columns=[field])
        df['Loc'] = "Field"
        df.set_index('Loc', append=True, inplace=True)
        df.index.names = ['Id', 'Loc']
        df = df.reorder_levels(['Loc', 'Id'])
        hashes = {
            field: int(hashlib.md5(str(data).encode('utf-8')).hexdigest(), 16)
        }
        return field, df, hashes
    except Exception as e:
        # Best effort: an unreadable file is reported (in DEBUG mode) and
        # skipped by the caller rather than aborting the whole import.
        if DEBUG:
            print("Error processing datafile " + fn)
            print(e)
        return None
def test_reorder_levels(self): index = MultiIndex( levels=[["bar"], ["one", "two", "three"], [0, 1]], codes=[[0, 0, 0, 0, 0, 0], [0, 1, 2, 0, 1, 2], [0, 1, 0, 1, 0, 1]], names=["L0", "L1", "L2"], ) df = DataFrame({"A": np.arange(6), "B": np.arange(6)}, index=index) # no change, position result = df.reorder_levels([0, 1, 2]) tm.assert_frame_equal(df, result) # no change, labels result = df.reorder_levels(["L0", "L1", "L2"]) tm.assert_frame_equal(df, result) # rotate, position result = df.reorder_levels([1, 2, 0]) e_idx = MultiIndex( levels=[["one", "two", "three"], [0, 1], ["bar"]], codes=[[0, 1, 2, 0, 1, 2], [0, 1, 0, 1, 0, 1], [0, 0, 0, 0, 0, 0]], names=["L1", "L2", "L0"], ) expected = DataFrame({ "A": np.arange(6), "B": np.arange(6) }, index=e_idx) tm.assert_frame_equal(result, expected) result = df.reorder_levels([0, 0, 0]) e_idx = MultiIndex( levels=[["bar"], ["bar"], ["bar"]], codes=[[0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0]], names=["L0", "L0", "L0"], ) expected = DataFrame({ "A": np.arange(6), "B": np.arange(6) }, index=e_idx) tm.assert_frame_equal(result, expected) result = df.reorder_levels(["L0", "L0", "L0"]) tm.assert_frame_equal(result, expected)
def test_pandas_extend_index(self): d1 = DataFrame(data=[2, 4, 6, 8], columns=["A"], index=[1, 2, 3, 4]) d1.index.name = "first" d1["second"] = "default" d1.set_index(["second"], append=True, inplace=True) self.assertEqual(d1.index.names, ["first", "second"]) d1 = d1.reorder_levels(["second", "first"]) self.assertEqual(d1.index.names, ["second", "first"])
def aggregate_chunks(mod_features_df, modality):
    """Collapse the columns into one count-weighted average per feature row.

    The ``('info', 'count')`` row supplies one weight per column. Every row
    whose ``field`` is not ``"info"`` is multiplied by those weights, summed
    across columns and divided by the total weight. The result gets an
    extra leading ``modality`` index level holding the given value.
    """
    feature_rows = mod_features_df.query('field != "info"')

    # One copy of the count row per feature row, aligned on the row index.
    counts = list(mod_features_df.loc[('info', 'count'), :].values)
    weights = DataFrame([counts] * len(feature_rows), index=feature_rows.index)

    weighted_sum = (feature_rows * weights).sum(axis=1)
    averaged = DataFrame(weighted_sum / weights.sum(axis=1),
                         index=feature_rows.index)

    averaged['modality'] = modality
    averaged = averaged.set_index('modality', append=True)
    return averaged.reorder_levels(['modality', 'field', 'feature'])
def sort_hierarchical_data(data: pd.DataFrame) -> pd.DataFrame:
    """Reorder index labels of a hierarchical index and sort in level order."""
    preferred_order = (
        "location", "sex", "age_start", "age_end", "year_start", "year_end",
    )
    present_names = data.index.names

    # Known levels come first in the preferred order; any remaining levels
    # keep their current relative order behind them.
    leading = [name for name in preferred_order if name in present_names]
    trailing = [name for name in present_names if name not in leading]
    level_order = leading + trailing

    if isinstance(data.index, pd.MultiIndex):
        data = data.reorder_levels(level_order)
    return data.sort_index()
def reshape(data: pd.DataFrame,
            value_cols: List = DRAW_COLUMNS) -> pd.DataFrame:
    """Move every column that is not in ``value_cols`` into the index.

    A frame without a MultiIndex gets a fresh index built from the ordered
    non-value columns. Otherwise any leftover non-value columns are
    appended to the existing index and the levels are reordered. Data whose
    columns are all value columns is returned unchanged.
    """
    has_flat_index = (isinstance(data, pd.DataFrame)
                      and not isinstance(data.index, pd.MultiIndex))
    if has_flat_index:
        # push all non-val cols into index
        index_cols = get_ordered_index_cols(
            data.columns.difference(value_cols))
        return data.set_index(index_cols)

    leftover_cols = data.columns.difference(value_cols)
    if leftover_cols.empty:
        # we've already set the full index
        return data

    # we missed some columns that need to be in index
    data = data.set_index(list(leftover_cols), append=True)
    level_order = get_ordered_index_cols(set(data.index.names))
    return data.reorder_levels(level_order)
def add_elo_rating(data_frame: pd.DataFrame):
    """Add ELO rating of team prior to matches"""
    has_both_scores = ("score" in data_frame.columns
                       and "oppo_score" in data_frame.columns)
    if not has_both_scores:
        columns_given = list(data_frame.columns)
        raise ValueError(
            "To calculate ELO ratings, 'score' and 'oppo_score' must be "
            "in the data frame, but the columns given were "
            f"{columns_given}")

    # Walk the rows in year / round / team order.
    match_order = [YEAR_LEVEL, ROUND_LEVEL, TEAM_LEVEL]
    elo_data_frame = data_frame.reorder_levels(match_order)
    elo_data_frame = elo_data_frame.sort_index(ascending=True)

    rate_match = partial(_calculate_match_elo_rating, elo_data_frame)
    ratings = reduce(rate_match, elo_data_frame.iterrows(), None)

    # Reorder the accumulated ratings before attaching them as a column.
    team_first = [
        REORDERED_TEAM_LEVEL, REORDERED_YEAR_LEVEL, REORDERED_ROUND_LEVEL
    ]
    elo_column = ratings.reorder_levels(team_first).sort_index()

    return data_frame.assign(elo_rating=elo_column)
def import_foam_folder(path,
                       search,
                       files,
                       skiplines=1,
                       maxlines=0,
                       skiptimes=1,
                       exclude=None):
    """ returns a Dataframe for every file in fileList

    The files reported by ``find_datafiles`` are read one time folder at a
    time, merged per folder, tagged with the folder's time and stacked.

    Returns
    -------
    ``(origins, df)`` where ``df`` is indexed by ('Time', 'Loc', 'Id') and
    ``origins`` records file and hash of each imported field. ``None`` is
    returned when no files are found.
    """
    from pandas import concat
    fileList = find_datafiles(path,
                              search=search,
                              files=files,
                              exclude=exclude)
    if not fileList:
        print("no files found")
        return None
    p_bar = ProgressBar(n_tot=sum(len(l) for l in fileList.values()))
    df = DataFrame()
    origins = Origins()
    # skiptimes is the stride over the time folders (1 keeps all of them).
    els = list(fileList.items())[::skiptimes]
    for fullpath, files in els:
        time = strip_time(fullpath, path)
        df_tmp = DataFrame()
        for fn in files:
            ret = read_data_file(fn, skiplines, maxlines)
            p_bar.next()
            if not ret:
                continue
            field_names, x, hashes = ret
            loc = x.index.values[-1][0]
            if df_tmp.empty:
                df_tmp = x
            else:
                try:
                    # use combine first for all df at existing Loc or
                    # if not Loc is specified (Eul or Lag fields)
                    if x.index.levels[0][0] in df_tmp.index.levels[0]:
                        df_tmp = df_tmp.combine_first(x)
                    else:
                        df_tmp = concat([df_tmp, x])
                except Exception as e:
                    # Best effort: report the frame that could not be
                    # merged and carry on with the remaining files.
                    print(x)
                    print(e)
            if not isinstance(field_names, list):
                field_names = [field_names]
            for field in field_names:
                origins.insert(time, loc, field, fn, hashes[field])
        df_tmp['Time'] = time
        if df.empty:
            df = df_tmp
        else:
            # DataFrame.append is gone from current pandas; concat of the
            # two frames is the documented replacement.
            df = concat([df, df_tmp])
    df.set_index('Time', append=True, inplace=True)
    df = df.reorder_levels(['Time', 'Loc', 'Id'])
    p_bar.done()
    return origins, df
def normalize_df(target: DataFrame,
                 normer: DataFrame,
                 ind_sep: Optional[str] = "-",
                 alphas: Optional[Iterable[float]] = None,
                 cv: float = 5,
                 **RidgeCV_kws) -> DataFrame:
    """
    Used to normalize a dataset by another dataset, using a linear model with
    regularization chosen through cross validation (aka sklearn's RidgeCV).
    This is useful for normalizing, for example, RNA values by CNA, or
    phosphopeptide values by protein abundance. If target and normer dataframe
    row IDs (index) match 1:1, pass None for ind_sep.

    Args:
        target: Dataframe of values to normalize. Row IDs (index) before the
            sep (or whole ID if no sep) must match normer IDs. Row IDs must
            be unique.
        normer: Dataframe of values to use for normalization. Row IDs must
            match all or pre-ind_sep portions of target row IDs. Row IDs must
            be unique.
        ind_sep: If multiple rows in target map to 1 row in normer, the
            delimiter used to split the unique ID that matches the normer
            IDs. Default "-"
        alphas: Parameters to try for regularization. If None or empty, tries
            powers of 2 from -10 to 9.
        cv: Fold for cross validation. Also the minimum number of columns the
            two dataframes must have in common. Default 5
        **RidgeCV_kws: kws to pass to sklearn's RidgeCV

    Returns: normed
        The result of applying ``_convert_to_residuals`` to every target row
        that has a matching normer row.

    Raises:
        KeyError: If the dataframes share fewer than ``cv`` columns, or if no
            target row matches a normer row.
    """
    # Materialise first: truth-testing a numpy array raises, and a generator
    # is always truthy even when it yields nothing.
    if alphas is not None:
        alphas = list(alphas)
    if not alphas:
        alphas = [2**i for i in range(-10, 10, 1)]

    normer = normer[[col for col in target.columns if col in normer.columns]]
    target = target[normer.columns]
    if (len(normer.columns) < cv) or (len(target.columns) < cv):
        raise KeyError(
            "target and normer dataframes do not have at least %s columns in common"
            % cv)

    def _tag_columns(frame: DataFrame, tag: int) -> DataFrame:
        # Prefix every column with a constant outer level so target (0) and
        # normer (1) values can be told apart after the merge below.
        frame = frame.transpose()
        frame["col0"] = tag
        frame.set_index("col0", append=True, inplace=True)
        return frame.reorder_levels(
            [frame.index.names[-1], frame.index.names[0]]).transpose()

    target = _tag_columns(target, 0)
    normer = _tag_columns(normer, 1)

    target["gene"] = [i.split(ind_sep)[0] for i in target.index]
    target = target.loc[target["gene"].isin(normer.index), :]
    if len(target) == 0:
        raise KeyError("No rows in common between target and normer")
    logging.info(
        "Normalizing %s common rows and %s common samples between target and normer",
        len(target), len(normer.columns))

    data = target.merge(normer, how="left", left_on="gene", right_index=True)
    model = lm.RidgeCV(alphas=alphas, cv=cv, **RidgeCV_kws)
    normed = data.apply(
        (lambda row: _convert_to_residuals(row[0], row[1], model)), axis=1)
    return normed
def __mul__(self, other: 'Conditional'):
    """Multiply two conditional distributions.

    Conditioning variables present in only one operand are added to the
    other by repeating its data for each of their states. Every row of one
    operand is then multiplied with every row of the other, and the row
    and column levels of the result are sorted by name.

    :param other: Right-hand operand. Anything that is not a Conditional
        is handled by delegating to ``other * self``.
    :return: A new Conditional built from the combined data.
    """
    if not isinstance(other, Conditional):
        return other * self

    def expand_conditions(data: DataFrame,
                          new_states: Dict[str, list]) -> DataFrame:
        """
        Repeat the data for each state of the new_states dict.

        :param data: Original data.
        :param new_states: Dict mapping new variables to states.
        """
        num_additional_states = np_product(
            [len(values) for values in new_states.values()])
        data_width = data.shape[1]
        expanded_data = concat(
            [data for _ in range(num_additional_states)], axis=1)

        # One tuple per column of the *expanded* frame. Taking the tuples
        # from the unexpanded ``data`` made zip() below stop after
        # ``data_width`` entries, leaving too few column labels.
        if isinstance(expanded_data.columns, MultiIndex):
            expanded_index = list(expanded_data.columns.values)
        else:
            expanded_index = [(x, ) for x in expanded_data.columns.values]

        # Each new state combination labels one full copy of the data.
        additional_index = list(
            chain.from_iterable(
                repeat(x, data_width)
                for x in list(product(*new_states.values()))))
        new_names = list(new_states.keys()) + list(data.columns.names)
        new_columns = [
            tuple(chain(ai, xi))
            for ai, xi in zip(additional_index, expanded_index)
        ]
        expanded_data.columns = MultiIndex.from_tuples(tuples=new_columns,
                                                       names=new_names)
        return expanded_data

    # for each conditional that is only in one distribution,
    # replicate the other distribution for each state in that conditional
    self_conds = set(self._conditional_variables)
    other_conds = set(other._conditional_variables)
    if len(other_conds - self_conds) > 0:
        self_data = expand_conditions(
            self._data, {
                cond: other._states[cond]
                for cond in other_conds if cond not in self_conds
            })
    else:
        self_data = self._data
    if len(self_conds - other_conds) > 0:
        other_data = expand_conditions(
            other._data, {
                cond: self._states[cond]
                for cond in self_conds if cond not in other_conds
            })
    else:
        other_data = other._data

    # multiply joint variables as if it were a joint distribution
    results = {}
    for d1_states, d1_values in self_data.iterrows():
        k1 = list(d1_states) if isinstance(d1_states, tuple) else [d1_states]
        for d2_states, d2_values in other_data.iterrows():
            if isinstance(d2_states, tuple):
                k2 = list(d2_states)
            else:
                k2 = [d2_states]
            results[tuple(k1 + k2)] = d1_values * d2_values

    data = DataFrame(results).T
    data.index.names = (list(self_data.index.names) +
                        list(other_data.index.names))
    data = data.reorder_levels(sorted(data.index.names), axis=0)
    data = data.reorder_levels(sorted(data.columns.names), axis=1)

    new_joints = list(data.index.names)
    new_conds = list(data.columns.names)
    new_states = {
        variable: (self._states[variable]
                   if variable in self._states.keys() else
                   other._states[variable])
        for variable in set(new_joints + new_conds)
    }
    return Conditional(data=data,
                       joint_variables=new_joints,
                       conditional_variables=new_conds,
                       states=new_states)
def pivot_df(  # pylint: disable=too-many-locals, too-many-arguments, too-many-statements, too-many-branches
    df: pd.DataFrame,
    rows: List[str],
    columns: List[str],
    metrics: List[str],
    aggfunc: str = "Sum",
    transpose_pivot: bool = False,
    combine_metrics: bool = False,
    show_rows_total: bool = False,
    show_columns_total: bool = False,
    apply_metrics_on_rows: bool = False,
) -> pd.DataFrame:
    """
    Pivot ``df`` and optionally add fractions, subtotals and totals.

    :param df: Frame to pivot. When neither ``rows`` nor ``columns`` is
        given, its index is reassigned in place.
    :param rows: Column names used as the pivot index.
    :param columns: Column names used as the pivot columns.
    :param metrics: Column names used as the pivot values; also selects and
        orders the metric columns of the result.
    :param aggfunc: Key into ``pivot_v2_aggfunc_map`` (defined outside this
        function). Names ending in " as Fraction of Total", " as Fraction
        of Columns" or " as Fraction of Rows" also divide the pivoted
        values by the matching sum.
    :param transpose_pivot: Swap ``rows`` and ``columns`` before pivoting.
    :param combine_metrics: With MultiIndex columns, move the metric level
        to the bottom and sort the columns with ``get_column_key``.
    :param show_rows_total: Insert a total/subtotal column after each group
        of columns.
    :param show_columns_total: Insert a total/subtotal row after each group
        of rows; only applied when ``rows`` is non-empty.
    :param apply_metrics_on_rows: Swap ``rows`` and ``columns`` up front and
        transpose the final frame.
    :return: The pivoted frame, whose index and columns are both
        MultiIndexes.
    """
    metric_name = f"Total ({aggfunc})"

    if transpose_pivot:
        rows, columns = columns, rows

    # to apply the metrics on the rows we pivot the dataframe, apply the
    # metrics to the columns, and pivot the dataframe back before
    # returning it
    if apply_metrics_on_rows:
        rows, columns = columns, rows
        axis = {"columns": 0, "rows": 1}
    else:
        axis = {"columns": 1, "rows": 0}

    # pivot data; we'll compute totals and subtotals later
    if rows or columns:
        # pivoting with null values will create an empty df
        df = df.fillna("NULL")
        df = df.pivot_table(
            index=rows,
            columns=columns,
            values=metrics,
            aggfunc=pivot_v2_aggfunc_map[aggfunc],
            margins=False,
        )
    else:
        # if there's no rows nor columns we have a single value; update
        # the index with the metric name so it shows up in the table
        df.index = pd.Index([*df.index[:-1], metric_name], name="metric")

    # if no rows were passed the metrics will be in the rows, so we
    # need to move them back to columns
    if columns and not rows:
        df = df.stack()
        if not isinstance(df, pd.DataFrame):
            df = df.to_frame()
        df = df.T
        df = df[metrics]
        df.index = pd.Index([*df.index[:-1], metric_name], name="metric")

    # combining metrics changes the column hierarchy, moving the metric
    # from the top to the bottom, eg:
    #
    # ('SUM(col)', 'age', 'name') => ('age', 'name', 'SUM(col)')
    if combine_metrics and isinstance(df.columns, pd.MultiIndex):
        # move metrics to the lowest level
        new_order = [*range(1, df.columns.nlevels), 0]
        df = df.reorder_levels(new_order, axis=1)

        # sort columns, combining metrics for each group
        decorated_columns = [(col, i) for i, col in enumerate(df.columns)]
        grouped_columns = sorted(decorated_columns,
                                 key=lambda t: get_column_key(t[0], metrics))
        indexes = [i for col, i in grouped_columns]
        df = df[df.columns[indexes]]
    elif rows:
        # if metrics were not combined we sort the dataframe by the list
        # of metrics defined by the user
        df = df[metrics]

    # compute fractions, if needed
    if aggfunc.endswith(" as Fraction of Total"):
        total = df.sum().sum()
        df = df.astype(total.dtypes) / total
    elif aggfunc.endswith(" as Fraction of Columns"):
        total = df.sum(axis=axis["rows"])
        df = df.astype(total.dtypes).div(total, axis=axis["columns"])
    elif aggfunc.endswith(" as Fraction of Rows"):
        total = df.sum(axis=axis["columns"])
        df = df.astype(total.dtypes).div(total, axis=axis["rows"])

    # convert to a MultiIndex to simplify logic
    if not isinstance(df.index, pd.MultiIndex):
        df.index = pd.MultiIndex.from_tuples([(str(i), ) for i in df.index])
    if not isinstance(df.columns, pd.MultiIndex):
        df.columns = pd.MultiIndex.from_tuples([(str(i), )
                                                for i in df.columns])

    if show_rows_total:
        # add subtotal for each group and overall total; we start from the
        # overall group, and iterate deeper into subgroups
        groups = df.columns
        for level in range(df.columns.nlevels):
            subgroups = {group[:level] for group in groups}
            for subgroup in subgroups:
                slice_ = df.columns.get_loc(subgroup)
                subtotal = pivot_v2_aggfunc_map[aggfunc](df.iloc[:, slice_],
                                                         axis=1)
                # pad the label with empty strings down to the lowest level
                depth = df.columns.nlevels - len(subgroup) - 1
                total = metric_name if level == 0 else "Subtotal"
                subtotal_name = tuple([*subgroup, total, *([""] * depth)])
                # insert column after subgroup
                df.insert(int(slice_.stop), subtotal_name, subtotal)

    if rows and show_columns_total:
        # add subtotal for each group and overall total; we start from the
        # overall group, and iterate deeper into subgroups
        groups = df.index
        for level in range(df.index.nlevels):
            subgroups = {group[:level] for group in groups}
            for subgroup in subgroups:
                slice_ = df.index.get_loc(subgroup)
                subtotal = pivot_v2_aggfunc_map[aggfunc](
                    df.iloc[slice_, :].apply(pd.to_numeric), axis=0)
                # pad the label with empty strings down to the lowest level
                depth = df.index.nlevels - len(subgroup) - 1
                total = metric_name if level == 0 else "Subtotal"
                subtotal.name = tuple([*subgroup, total, *([""] * depth)])
                # insert row after subgroup
                df = pd.concat([
                    df[:slice_.stop],
                    subtotal.to_frame().T, df[slice_.stop:]
                ])

    # if we want to apply the metrics on the rows we need to pivot the
    # dataframe back
    if apply_metrics_on_rows:
        df = df.T

    return df
def import_logs(folder, search, keys):
    """
    Parse the log files in ``folder`` whose name contains ``search``.

    keys = {"ExectionTime": ["ExecTime", "ClockTime"]}

    Each key is a regular expression searched in every log line; the
    numbers found on a matching line are stored under the listed column
    names. The caller's ``keys`` dict is left untouched.

    return a DataFrame

        Loc, Time KeyName1 Keyname2
        1    0.1
             0.2
        2

    Returns ``({}, df)`` with ``df`` indexed by ('Loc', 'Time', 'Id'), where
    'Loc' is the position of the log in the list of matching files. An
    empty DataFrame is returned when no log matches, and ``({}, None)``
    when a log cannot be turned into that layout.
    """
    import re

    # Raw string: same pattern as before, without the invalid ``\-``
    # escape, and compiled once instead of once per line.
    number_pattern = re.compile(r"[0-9]+[.]?[0-9]*[e]?[\-]?[0-9]*")

    def find_start(log):
        """ Fast forward through file till 'Starting time loop' """
        for i, line in enumerate(log):
            if "Starting time loop" in line:
                return i
        # No marker found: None makes the slice below start at line 0.
        return None

    def extract(line, keys):
        """
        returns key and values as list
        "ExecutionTime":[0,1]
        """
        for key, col_names in keys.items():
            if re.search(key, line):
                numbers = [float(m) for m in number_pattern.findall(line) if m]
                return col_names, numbers
        return None, None

    fold, dirs, files = next(os.walk(folder))
    logs = [fold + "/" + log for log in files if search in log]
    p_bar = ProgressBar(n_tot=len(logs))

    # Lets make sure that we find Timesteps in the log; work on a copy so
    # the dict passed in by the caller is not modified.
    keys = dict(keys)
    keys["^Time = "] = ['Time']

    frames = []
    for log_number, log_name in enumerate(logs):
        with open(log_name, encoding="utf-8") as log:
            f = log.readlines()
        start = find_start(f)
        dataDict = defaultdict(list)
        df = DataFrame()
        # NOTE(review): the last line is skipped and the values collected
        # after the final 'Time' line are never flushed into df -- confirm
        # whether dropping the last time step is intended.
        for line in f[start:-1]:
            col_names, values = extract(line, keys)
            if not col_names:
                continue
            if col_names[0] == 'Time':
                # a new time step has begun
                # flush datadict and concat to df
                # Very slow but, so far the solution
                # to keep subiterations attached to correct time
                # FIXME: still needs handling of different length dictionaries
                df = concat([df, DataFrame(dataDict)])
                dataDict = defaultdict(list)
            for i, col in enumerate(col_names):
                dataDict[col].append(values[i])
        p_bar.next()
        try:
            df.index = range(len(df))
            df.index.names = ['Id']
            df['Loc'] = log_number
            df.set_index('Time', append=True, inplace=True)
            df.set_index('Loc', append=True, inplace=True)
            df = df.reorder_levels(['Loc', 'Time', 'Id'])
            p_bar.done()
        except Exception as e:
            print(log_name)
            print("failed to process")
            print(e)
            return {}, None
        frames.append(df)

    # Hand back what was parsed; previously the frames were built and then
    # replaced by a fresh empty DataFrame.
    if not frames:
        return {}, DataFrame()
    return {}, concat(frames)
def read_data_file(fn, skiplines=1, maxlines=False, p_bar=None):
    """ A function to read any foam data files returning data and index after header

    Parameters
    ----------
    fn : str
        Path of the data file; the part after the last '/' names the
        column of single-column files.
    skiplines : int
        Step used when slicing the data lines (1 keeps every line).
    maxlines
        Accepted for interface compatibility; not used by this function.
    p_bar : ProgressBar, optional
        Advanced by one step once the header has been parsed.

    Returns
    -------
    ``(names, df, hashes)`` for files with several entries per line,
    ``(field, df, hashes)`` for files with one entry per line, or ``None``
    when the file does not exist or anything fails while processing it.
    Frames are indexed by ('Loc', 'Pos').
    """
    # TODO check if sorting the index gives any performance benefits
    if not os.path.exists(fn):
        print("Can not open file " + fn)
        return None
    try:
        # Only the read needs the file handle; parsing happens afterwards.
        with open(fn, encoding="utf-8") as f:
            content = f.readlines()
        field = fn.split('/')[-1]
        # NOTE(review): the extra trailing entry looks like a sentinel for
        # if_header_skip -- confirm against that helper.
        content.append('bla')
        start, num_entries = if_header_skip(content)
        entries = len(content[start].split())
        end = start + num_entries
        # FIXME this fails for eulerian/lagrangian vector fields
        # since no positional entry is produced
        if isinstance(p_bar, ProgressBar):
            p_bar.next()
        rows = content[start:end:skiplines]

        if entries > 1:
            # Raw string: same pattern as before, without invalid escapes.
            strip_parens = re.compile(r"[0-9]*\(|\)")
            data = [strip_parens.sub('', row).split() for row in rows]
            loc, names = evaluate_names(fn, entries)
            df = DataFrame(data=data, columns=names)
            df['Loc'] = loc if loc else range(len(df))
            if "Pos" in df:
                df.set_index('Loc', append=False, inplace=True)
                df["Pos"] = df["Pos"].astype(float)
                df.set_index('Pos', append=True, inplace=True)
            else:
                # if no pos is availible we have either
                # an eulerian or lagrangian field
                df.set_index('Loc', append=True, inplace=True)
                df.index.names = ['Pos', 'Loc']
                df = df.reorder_levels(['Loc', 'Pos'])
            df = df.astype(float)
            hashes = {col: hash_series(df[col]) for col in df.columns}
            return names, df, hashes

        # DataFile with a single row are seen as Eulerian or Lagrangian fields
        data = [np.float32(x) for x in rows]
        df = DataFrame(data=data, columns=[field])
        df['Loc'] = "Field"
        df.set_index('Loc', append=True, inplace=True)
        df.index.names = ['Pos', 'Loc']
        df = df.reorder_levels(['Loc', 'Pos'])
        if HASH_RESULTS:
            digest = hashlib.md5(str(data).encode('utf-8')).hexdigest()
            hashes = {field: int(digest, 16)}
        else:
            hashes = {field: 0}
        return field, df, hashes
    except Exception as e:
        # Best effort: an unreadable file is reported (in DEBUG mode) and
        # skipped by the caller rather than aborting the whole import.
        if DEBUG:
            print("Error processing datafile " + fn)
            print(e)
        return None
def import_foam_folder(
        path,
        search,
        files,
        skiplines=1,
        maxlines=0,
        skiptimes=1,
        exclude=None,
        times_slice=None,
    ):
    """ returns a Dataframe for every file in fileList

    The files reported by ``find_datafiles`` are read one time folder at a
    time, merged per folder with ``combine_first``, tagged with the
    folder's time and stacked.

    Returns
    -------
    ``(origins, df)``. ``df`` is indexed by ('Time', 'Loc', 'Pos') when a
    'Loc' level exists; otherwise it is printed and returned without the
    levels being reordered. ``(None, DataFrame())`` is returned when no
    files are found.
    """
    # Local import keeps the block self-contained now that concat
    # replaces the removed DataFrame.append.
    from pandas import concat

    fileList = find_datafiles(
        path, search=search, files=files,
        exclude=exclude, times_slice=times_slice
    )
    if not fileList:
        print("no files found")
        return None, DataFrame()
    p_bar = ProgressBar(n_tot=sum(len(l) for l in fileList.values()))
    df = DataFrame()
    origins = Origins()
    # skiptimes is the stride over the time folders (1 keeps all of them).
    els = list(fileList.items())[::skiptimes]
    for fullpath, files in els:
        time = strip_time(fullpath, path)
        df_tmp = DataFrame()
        for fn in files:
            ret = read_data_file(fn, skiplines, maxlines)
            p_bar.next()
            if not ret or ret[1].empty:
                continue
            field_names, x, hashes = ret
            loc = x.index.values[-1][0]
            if df_tmp.empty:
                df_tmp = x
            else:
                try:
                    df_tmp = df_tmp.combine_first(x)
                except Exception as e:
                    # Best effort: report both frames and carry on.
                    print("failed to concat: ",
                          df_tmp, "and", x, "new_loc ",
                          x.index.levels[0][0], " existing_locs ",
                          df_tmp.index.levels[0])
                    print(e)
            if not isinstance(field_names, list):
                field_names = [field_names]
            for field in field_names:
                if field == "Pos":
                    continue
                origins.insert(time, loc, field, fn, hashes[field])
        df_tmp['Time'] = time
        if df.empty:
            df = df_tmp
        else:
            # DataFrame.append is gone from current pandas; concat of the
            # two frames is the documented replacement.
            df = concat([df, df_tmp])
    df.set_index('Time', append=True, inplace=True)
    if "Loc" not in df.index.names:
        # Without a 'Loc' level the frame is only shown, not reordered.
        print(df)
    else:
        df = df.reorder_levels(['Time', 'Loc', 'Pos'])
    p_bar.done()
    return origins, df
def reorder_pairs(pair_df: pd.DataFrame,
                  num_stars: int,
                  parameters: Dict[Union[str, int], Tuple[Union[float, str]]],
                  df: pd.DataFrame,
                  delta_h: float,
                  report_verbose: bool,
                  xmax: np.ndarray,
                  xmin: np.ndarray,
                  offline_mode: bool
                  ) -> pd.DataFrame:
    """
    Compute the normalised distance 'h' of every star point pair, bin the
    pairs by that distance and return the pair dataframe in binned order.

    Parameters
    ----------
    pair_df : pd.DataFrame
        Paired star point values with the model outputs. An 'actual h'
        column is written onto this frame.
    num_stars : int
        number of star samples
    parameters : dictionary
        parameter names and their attributes; only the keys are used here
    df : pd.DataFrame
        star points and model outputs
    delta_h : float
        resolution of star samples
    report_verbose : boolean
        if True, progress bars are shown
    xmax : arraylike
        max boundary of each parameter
    xmin : arraylike
        min boundary of each parameter
    offline_mode : boolean
        if True the parameter position is looked up in ``df`` as a string
        key, otherwise as an integer

    Returns
    -------
    pair_df : pd.DataFrame
        the pairs indexed by 'centre', 'param', 'h', 'actual h', 'pair_ind'
    """
    # for loading bar when calculating differences in values 'h'
    star_centres = range(0, num_stars)
    if report_verbose:
        star_centres = tqdm(range(0, num_stars),
                            desc='calculating \'h\' values')

    # gather the actual 'h' differences between each star point value for
    # every pair
    dist_list = []
    for star_centre in star_centres:
        for param_num, param in enumerate(parameters.keys()):
            # offline and online mode only differ in the key type
            column_key = str(param_num) if offline_mode else param_num
            star_values = df.loc[star_centre, param][column_key]
            param_range = xmax[param_num] - xmin[param_num]
            pairs = pairs_h(star_values.index.get_level_values(-1))
            for idx in pairs.values():
                for idx_tup in idx:
                    difference = (star_values[idx_tup[0]]
                                  - star_values[idx_tup[1]])
                    dist_list.append(np.abs(difference / param_range))

    # loading bar for binning and reordering pairs based on new 'h' values
    if report_verbose:
        pairs_pbar = tqdm(
            desc='binning and reording pairs based on \'h\' values',
            total=2, dynamic_ncols=True)

    # add new distances to dataframe
    pair_df['actual h'] = dist_list

    # bin edges: 0, then the midpoints between consecutive delta_h steps
    num_bins = int(1 / delta_h)
    bins = np.zeros(num_bins + 1)
    bins[1:] = np.arange(start=delta_h / 2, step=delta_h, stop=1)

    # bin labels are the delta h values themselves (first bin: delta_h / 4)
    labels = np.zeros(num_bins)
    labels[0] = delta_h / 4
    labels[1:] = np.arange(start=delta_h, step=delta_h, stop=1)

    # bin pair values by 'h' for each parameter at each star centre
    binned_pairs = [
        pd.cut(pair_df.loc[star_centre, param, :]['actual h'],
               bins=bins, labels=labels).sort_values()
        for star_centre in range(0, num_stars)
        for param in parameters.keys()
    ]
    binned_pairs = pd.concat(binned_pairs, ignore_index=False)
    if report_verbose:
        pairs_pbar.update(1)

    # re order pairs values according to the bins
    pair_df = pair_df.loc[binned_pairs.index]

    # add in new index h, according to bin ranges
    # ex.) h = 0.1 = [0-0.15], h = 0.2 = [0.15-0.25]
    pair_df['h'] = list(binned_pairs.values)

    # format data frame so that it works properly with variogram analysis
    # functions
    pair_df.set_index('h', append=True, inplace=True)
    pair_df.set_index('actual h', append=True, inplace=True)
    pair_df = pair_df.reorder_levels(
        ['centre', 'param', 'h', 'actual h', 'pair_ind'])

    if report_verbose:
        pairs_pbar.update(1)
        pairs_pbar.close()

    return pair_df
def import_foam_folder(
        path,
        search,
        files,
        skiplines=1,
        maxlines=0,
        skiptimes=slice(0, None),
        exclude=None,
        times_slice=None,
    ):
    """ returns a Dataframe for every file in fileList

    The files reported by ``find_datafiles`` are read one time folder at a
    time (through a process pool when ``MULTIPROCESS`` is set), merged per
    folder with ``combine_first``, tagged with the folder's time and
    stacked.

    Returns
    -------
    ``(origins, df)``. ``df`` is indexed by ('Time', 'Loc', 'Pos') when a
    'Loc' level exists; otherwise it is printed and returned without the
    levels being reordered. ``(None, DataFrame())`` is returned when no
    files are found.
    """
    # Local import keeps the block self-contained now that concat
    # replaces the removed DataFrame.append.
    from pandas import concat

    fileList = find_datafiles(
        path, search=search, files=files,
        exclude=exclude, times_slice=times_slice
    )
    if not fileList:
        print("no files found")
        return None, DataFrame()
    p_bar = ProgressBar(n_tot=sum(len(l) for l in fileList.values()))
    df = DataFrame()
    origins = Origins()
    # skiptimes is a slice selecting which time folders are imported.
    els = list(fileList.items())[skiptimes]
    for fullpath, files in els:
        time = strip_time(fullpath, path)
        df_tmp = DataFrame()
        args = [(fn, skiplines, maxlines, p_bar) for fn in files]
        if MULTIPROCESS:
            with multiprocessing.Pool(processes=MULTIPROCESS) as pool:
                rets = pool.map(read_data_file_args, args)
        else:
            rets = map(read_data_file_args, args)
        for fn, ret in zip(files, rets):
            if not ret or ret[1].empty:
                continue
            field_names, x, hashes = ret
            loc = x.index.values[-1][0]
            if df_tmp.empty:
                df_tmp = x
            else:
                try:
                    df_tmp = df_tmp.combine_first(x)
                except Exception as e:
                    # Best effort: report both frames and carry on.
                    print("failed to concat: ",
                          df_tmp, "and", x, "new_loc ",
                          x.index.levels[0][0], " existing_locs ",
                          df_tmp.index.levels[0])
                    print(e)
            if not isinstance(field_names, list):
                field_names = [field_names]
            for field in field_names:
                if field == "Pos":
                    continue
                origins.insert(time, loc, field, fn, hashes[field])
        df_tmp['Time'] = time
        if df.empty:
            df = df_tmp
        else:
            # DataFrame.append is gone from current pandas; concat of the
            # two frames is the documented replacement.
            df = concat([df, df_tmp])
    df.set_index('Time', append=True, inplace=True)
    if "Loc" not in df.index.names:
        # Without a 'Loc' level the frame is only shown, not reordered.
        print(df)
    else:
        df = df.reorder_levels(['Time', 'Loc', 'Pos'])
    p_bar.done()
    return origins, df
def import_foam_folder(
        path,
        search,
        files,
        skiplines=1,
        maxlines=0,
        skiptimes=1,
        exclude=None
    ):
    """ returns a Dataframe for every file in fileList

    The files reported by ``find_datafiles`` are read one time folder at a
    time, merged per folder, tagged with the folder's time and stacked.

    Returns
    -------
    ``(origins, df)`` where ``df`` is indexed by ('Time', 'Loc', 'Id') and
    ``origins`` records file and hash of each imported field. ``None`` is
    returned when no files are found.
    """
    from pandas import concat
    fileList = find_datafiles(
        path, search=search, files=files, exclude=exclude)
    if not fileList:
        print("no files found")
        return None
    p_bar = ProgressBar(n_tot=sum(len(l) for l in fileList.values()))
    df = DataFrame()
    origins = Origins()
    # skiptimes is the stride over the time folders (1 keeps all of them).
    els = list(fileList.items())[::skiptimes]
    for fullpath, files in els:
        time = strip_time(fullpath, path)
        df_tmp = DataFrame()
        for fn in files:
            ret = read_data_file(fn, skiplines, maxlines)
            p_bar.next()
            if not ret:
                continue
            field_names, x, hashes = ret
            loc = x.index.values[-1][0]
            if df_tmp.empty:
                df_tmp = x
            else:
                try:
                    # use combine first for all df at existing Loc or
                    # if not Loc is specified (Eul or Lag fields)
                    if x.index.levels[0][0] in df_tmp.index.levels[0]:
                        df_tmp = df_tmp.combine_first(x)
                    else:
                        df_tmp = concat([df_tmp, x])
                except Exception as e:
                    # Best effort: report the frame that could not be
                    # merged and carry on with the remaining files.
                    print(x)
                    print(e)
            if not isinstance(field_names, list):
                field_names = [field_names]
            for field in field_names:
                origins.insert(time, loc, field, fn, hashes[field])
        df_tmp['Time'] = time
        if df.empty:
            df = df_tmp
        else:
            # DataFrame.append is gone from current pandas; concat of the
            # two frames is the documented replacement.
            df = concat([df, df_tmp])
    df.set_index('Time', append=True, inplace=True)
    df = df.reorder_levels(['Time', 'Loc', 'Id'])
    p_bar.done()
    return origins, df