def dfLastLacFrecuencia(self, df: pd.DataFrame):
    groupedDf = df.groupby('LAST_LAC')['LAST_LAC'].agg(
        FRECUENCIA=pd.NamedAgg(column='LAST_LAC', aggfunc='size'))
    groupedDf['LAST_LAC'] = groupedDf.index
    groupedDf = groupedDf[['LAST_LAC', 'FRECUENCIA']]
    return groupedDf.sort_values(ascending=False, by='FRECUENCIA')
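# A minimal equivalent sketch (not the original author's code): value_counts()
# already returns per-LAST_LAC frequencies sorted in descending order, so the
# groupby/agg round-trip above can be collapsed to:
def df_last_lac_frecuencia(df: pd.DataFrame) -> pd.DataFrame:
    out = df['LAST_LAC'].value_counts().rename('FRECUENCIA').reset_index()
    out.columns = ['LAST_LAC', 'FRECUENCIA']  # works on pandas both <2.0 and >=2.0
    return out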
def __init__(self, load_file_name: str, features_files_dict: dict, total_payoff_label: bool = True,
             features_file_list: list = list(), use_all_history: bool = False, label: str = 'total_payoff',
             use_all_history_text_average: bool = False, use_all_history_text: bool = False,
             use_all_history_average: bool = False, use_prefix_suffix_setting: bool = False,
             features_to_drop: list = None, suffix_average_text: bool = False, no_suffix_text: bool = False,
             non_nn_turn_model: bool = False, transformer_model: bool = False,
             prefix_data_in_sequence: bool = False, data_type='train_data', no_decision_features: bool = False,
             suffix_no_current_round_average_text: bool = False):
    """
    :param load_file_name: the raw data file name
    :param features_files_dict: dict of features files and types
    :param total_payoff_label: whether the label is the expert's total payoff or the next rounds' normalized payoff
    :param label: the name of the label
    :param features_file_list: if using fixed features, the names of the features files
    :param use_all_history: whether to add numeric features regarding the history of decisions and lotteries
    :param use_all_history_average: whether to add numeric history features as an average over the history
    :param use_all_history_text: whether to use all the history text features
    :param use_all_history_text_average: whether to use the history text as an average over all the history
    :param use_prefix_suffix_setting: whether we create data for a CRF model with a fixed prefix
    :param features_to_drop: a list of features to drop
    :param suffix_average_text: whether to add the suffix average text features
    :param no_suffix_text: whether to skip the text of the suffix rounds
    :param non_nn_turn_model: non-neural-network models that predict a label for each round
    :param transformer_model: create data for a transformer model --> create features for prefix rounds too
    :param prefix_data_in_sequence: whether the prefix data is not in the suffix features but in the sequence
    :param no_decision_features: whether to check models without decision features
    :param suffix_no_current_round_average_text: whether we want the average of all suffix text
    """
    print(f'Start create and save data for file: '
          f'{os.path.join(data_directory, f"{load_file_name}_{data_type}.csv")}')
    logging.info('Start create and save data for file: {}'.format(
        os.path.join(data_directory, f'{load_file_name}_{data_type}.csv')))
    self.data = pd.read_csv(os.path.join(data_directory, f'{load_file_name}_{data_type}.csv'))  # , usecols=columns_to_use
    print(f'Number of rows in data: {self.data.shape[0]}')
    self.data = self.data.loc[(self.data.status == 'play') & (self.data.player_id_in_group == 2)]
    print(f'Number of rows in data: {self.data.shape[0]} after keeping only play rounds and decision makers')
    self.data = self.data.drop_duplicates()
    print(f'Number of rows in data: {self.data.shape[0]} after dropping duplicates')

    # get manual text features
    reviews_features_files_list = list()
    print(f'Load features from: {features_file_list}')
    for features_file in features_file_list:
        features_file_type = features_files_dict[features_file]
        if features_file_type == 'pkl':
            reviews_features_files_list.append(joblib.load(os.path.join(
                data_directory, f'{features_file}_{data_type}.{features_file_type}')))
        elif features_file_type == 'xlsx':
            features = pd.read_excel(os.path.join(
                data_directory, f'{features_file}_{data_type}.{features_file_type}'))
            if data_type == 'test_data':  # change order to be the same as in the train data
                train_features = pd.read_excel(
                    os.path.join(data_directory, f'{features_file}_train_data.{features_file_type}'))
                features = features[train_features.columns]
            reviews_features_files_list.append(features)
        else:
            print('Features file type has to be pkl or xlsx')
            return

    # get manual text features
    for index, reviews_features_file in enumerate(reviews_features_files_list):
        if 'review' in reviews_features_file:
            reviews_features_file = reviews_features_file.drop('review', axis=1)
        if 'score' in reviews_features_file:
            reviews_features_file = reviews_features_file.drop('score', axis=1)
        if reviews_features_file.shape[1] == 2:  # BERT features -> flatten the vectors
            reviews = pd.DataFrame()
            for i in reviews_features_file.index:
                temp = pd.DataFrame(reviews_features_file.at[i, 'review_features']).append(
                    pd.DataFrame([reviews_features_file.at[i, 'review_id']], index=['review_id']))
                reviews = pd.concat([reviews, temp], axis=1, ignore_index=True)
            reviews_features_files_list[index] = reviews.T
        else:  # manual features
            if features_to_drop is not None:
                reviews_features_files_list[index] = reviews_features_file.drop(features_to_drop, axis=1)

    if len(reviews_features_files_list) == 1:
        self.reviews_features = reviews_features_files_list[0]
    elif len(reviews_features_files_list) == 2:
        self.reviews_features = reviews_features_files_list[0].merge(reviews_features_files_list[1],
                                                                     on='review_id')
    else:
        print(f"Can't create reviews features with {len(reviews_features_files_list)} feature types")

    # calculate expert total payoff --> the label
    self.data['exp_payoff'] = self.data.group_receiver_choice.map({1: 0, 0: 1})
    total_exp_payoff = self.data.groupby(by='pair_id').agg(
        total_exp_payoff=pd.NamedAgg(column='exp_payoff', aggfunc=sum))
    self.data = self.data.merge(total_exp_payoff, how='left', right_index=True, left_on='pair_id')
    self.data['10_result'] = np.where(self.data.group_lottery_result == 10, 1, 0)
    self.data = self.data[['pair_id', 'total_exp_payoff', 'subsession_round_number',
                           'group_sender_answer_reviews', 'exp_payoff', 'group_lottery_result', 'review_id',
                           'previous_round_lottery_result', 'previous_round_decision', 'group_average_score',
                           'lottery_result_low', 'lottery_result_med1', 'previous_round_lottery_result_low',
                           'previous_round_lottery_result_high', 'previous_average_score_low',
                           'previous_average_score_high', 'previous_round_lottery_result_med1',
                           'group_sender_payoff', 'lottery_result_high', 'chose_lose', 'chose_earn',
                           'not_chose_lose', 'not_chose_earn', 'previous_score',
                           'group_sender_answer_scores', '10_result']]  # 'time_spent_low', 'time_spent_high',
    self.final_data = pd.DataFrame()
    self.pairs = pd.Series(self.data.pair_id.unique())
    self.total_payoff_label = total_payoff_label
    self.label = label
    self.number_of_rounds = 10
    self.features_file_list = features_file_list
    self.use_all_history = use_all_history
    self.use_all_history_average = use_all_history_average
    self.use_all_history_text_average = use_all_history_text_average
    self.use_all_history_text = use_all_history_text
    self.suffix_average_text = suffix_average_text
    self.suffix_no_current_round_average_text = suffix_no_current_round_average_text
    self.no_suffix_text = no_suffix_text
    self.non_nn_turn_model = non_nn_turn_model
    self.transformer_model = transformer_model
    self.prefix_data_in_sequence = prefix_data_in_sequence
    self.decisions_payoffs_columns = ['exp_payoff', 'lottery_result_high', 'lottery_result_low',
                                      'lottery_result_med1', 'chose_lose', 'chose_earn',
                                      'not_chose_lose', 'not_chose_earn']
    if no_decision_features:
        self.decisions_payoffs_columns = list()
    print(f'Number of pairs in data: {self.pairs.shape}')
    self.history_columns = list()
    if self.use_all_history_average:
        self.set_all_history_average_measures()

    # create file names:
    file_name_component = [
        f'{self.label}_label_',
        'prefix_suffix_' if use_prefix_suffix_setting else '',
        'non_nn_turn_model_' if self.non_nn_turn_model else '',
        'transformer_' if self.transformer_model else '',
        'all_history_features_' if self.use_all_history else '',
        f'all_history_features_avg_with_global_alpha_{alpha_global}_' if self.use_all_history_average else '',
        f'all_history_text_avg_with_alpha_{alpha_text}_' if self.use_all_history_text_average else '',
        'prefix_in_seq_' if prefix_data_in_sequence else '',
        'no_suffix_text_' if self.no_suffix_text else '',
        'all_suffix_text_average_' if self.suffix_average_text else '',
        'all_history_text_' if self.use_all_history_text else '',
        f'{self.features_file_list}_',
        'no_decision_features_' if no_decision_features else 'use_decision_features_',
        f'{condition}_{data_type}',
    ]
    self.base_file_name = ''.join(file_name_component)
    print(f'Create data for: {self.base_file_name}')
    return
import pandas as pd

df_dummy = pd.DataFrame(
    dict(id=[1, 1, 2, 2, 3, 3, 3], values=[3, 5, 6, 7, 8, 9, 15]))

df_stats = (df_dummy.groupby(["id"]).agg(
    count=pd.NamedAgg(column="values", aggfunc="count"),
    sum=pd.NamedAgg(column="values", aggfunc="sum"),
    max=pd.NamedAgg(column="values", aggfunc="max"),
).reset_index().assign(
    pct_value=lambda df: round(100 * df["sum"] / sum(df["sum"]), 2)))

# To double check - you might sample a column
df_temp = df_dummy.loc[lambda x: x["id"] == 1][["values"]]
df_temp.sum().values
df_temp.max().values

# Or you would do this:
df_check = pd.DataFrame({
    "id": {
        0: 1,
        1: 2,
        2: 3
    },
    "count": {
        0: 2,
        1: 2,
        2: 3
    },
df['EarlyDeliveryDate'] = pd.to_datetime(df['EarlyDeliveryDate'])
df['ReceivedDate'] = pd.to_datetime(df['ReceivedDate'])
# df.info()
df['DeliveryTime'] = [x / 10 if x > 0 else x for x in df.DeliveryTime]
df['NormDeliveryTime'] = 0 - df.DeliveryTime.abs()
scale = MinMaxScaler()
df['NormDeliveryTime'] = scale.fit_transform(df[['NormDeliveryTime']])
df['PercentKept'] = 1 - df.PercentOfQuantityReturned
df['ReceivedMonth'] = df['ReceivedDate'].dt.month
df['ReceivedYear'] = df['ReceivedDate'].dt.year
grouped_monthly = df.groupby(['ReceivedYear', 'ReceivedMonth', 'Vendor']).agg(
    MonthlyNormDeliveryTime=pd.NamedAgg(column='NormDeliveryTime', aggfunc='mean'),
    MonthlyPercentReceived=pd.NamedAgg(column='PercentOfQuantityReceived', aggfunc='mean'),
    MonthlyPercentKept=pd.NamedAgg(column='PercentKept', aggfunc='mean'),
    VendorId=pd.NamedAgg(column='Vendor', aggfunc='first'))

"""*Categorization* of orders into performing and non-performing along with the reason.
(Promptness, Quantity, Quality)"""

cls = KMeans(n_clusters=4)
cls_assignment = cls.fit_predict(
    grouped_monthly[['MonthlyPercentReceived', 'MonthlyNormDeliveryTime', 'MonthlyPercentKept']])
grouped_monthly['label'] = cls_assignment
grouped = grouped_monthly.groupby("VendorId")
vendor_output = pd.DataFrame(columns=['Vendor_ID', 'Performance', 'Performance_Percent',
                                      'UnderPerformance (Quality)', 'UnderPerformance (Quantity)',
                                      'UnderPerformance (Promptness)'])
for name, group in grouped:
    values = group['label'].value_counts()
features = [feat for _, feat in feature_names_to_columns]
for category, entry_name in feature_names_to_columns:
    unique_count = corpus[entry_name].unique().shape[0]
    print(f"Number of unique {category} : {unique_count}")

# Remove all data with categories which are too rare (have less than a predefined frequency)
for feature in ["pos", "pattern"]:
    print(f"{feature} category frequency : ")
    freq = corpus.groupby(feature).count().morpheme / len(corpus)
    print((freq * 100).to_string())
    low_freq = freq[freq < 0.2].dropna().index
    corpus = corpus[~corpus[feature].isin(low_freq)]

num_roots_per_morpheme = corpus[["morpheme", "root"]].drop_duplicates().groupby("morpheme").agg(
    count=pd.NamedAgg(column="morpheme", aggfunc='count'))
for num_roots in num_roots_per_morpheme["count"].unique():
    unique_count = len(num_roots_per_morpheme[num_roots_per_morpheme["count"] == num_roots])
    percent = int(unique_count / len(corpus["morpheme"].unique()) * 100)
    print(f"Number of morphemes with {num_roots} distinct roots : {unique_count} ({percent}%)")

special_radicals_map = {1: ("i", "w", "n"), 2: ("i", "w", "h"), 3: ("h", "i")}
special_roots_df_lst = []
for rad_i, special_vals in special_radicals_map.items():
    for ch in special_vals:
        ri_special = corpus[corpus.root.str[rad_i - 1] == ch]
        special_roots_df_lst.append(ri_special)
def get_window_features(self, campdate_df):
    """
    Finds metadata of each ANM in each time window.

    Parameters
    ----------
    campdate_df : dataframe
        preprocessed dataframe
    """
    grouped = campdate_df.groupby(['sub_center_id', 'camp_id'])
    bp = grouped.agg(
        Tot_num_patients=pd.NamedAgg(column='ANC_Mother Id', aggfunc=lambda x: len(list(x))),
        Num_camps=pd.NamedAgg(column='cluster_date', aggfunc=lambda x: len(set(x))),
        dates=pd.NamedAgg(column='cluster_date', aggfunc=lambda x: list(set(x)))).reset_index()
    anmgrouped = bp.groupby(['sub_center_id'])
    bpanm = anmgrouped.agg(
        Tot_num_patients=pd.NamedAgg(column='Tot_num_patients', aggfunc=sum),
        Num_camps=pd.NamedAgg(column='Num_camps', aggfunc=sum),
        Num_locations=pd.NamedAgg(column='camp_id', aggfunc=lambda x: len(list(x))),
        dates=pd.NamedAgg(column='dates', aggfunc=lambda x: list(x))).reset_index()

    # mean gap in days between consecutive camp dates (28 if fewer than 2 camps)
    dates_diff = np.array([])
    for i in range(len(bpanm)):
        num_camps = bpanm.loc[i, 'Num_camps']
        if num_camps < 2:
            dates_diff = np.append(dates_diff, 28)
        else:
            anm_dates = bpanm.loc[i, 'dates']
            flat_list = [item for sublist in anm_dates for item in sublist]
            flat_list.sort()
            anm_diff = np.array([])
            for j in range(1, len(flat_list)):
                diff = (flat_list[j] - flat_list[j - 1]).days
                anm_diff = np.append(anm_diff, diff)
            dates_diff = np.append(dates_diff, np.mean(anm_diff))
    dates_diff = pd.Series(dates_diff)
    bpanm = bpanm.assign(dates_diff=dates_diff.values)

    # merge with dummy dataframe
    df_merge = pd.merge(self.allANMdf, bpanm, on='sub_center_id', how='left')
    t = df_merge['Tot_num_patients'].isna().sum()
    df_merge['Tot_num_patients'].fillna(0, inplace=True)
    df_merge['Num_camps'].fillna(0, inplace=True)
    df_merge['dates_diff'].fillna(0, inplace=True)
    df_merge['Num_locations'].fillna(0, inplace=True)

    window_features = np.asarray([
        np.asarray(df_merge['Tot_num_patients']),
        np.asarray(df_merge['Num_camps']),
        np.asarray(df_merge['dates_diff']),
        np.asarray(df_merge['Num_locations'])
    ])
    return np.transpose(window_features), t
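# A minimal sketch (assuming the entries in 'dates' are datetime-like): the
# inner loop above that averages gaps between consecutive camp dates can be
# vectorized with pandas' diff():
import pandas as pd

def mean_date_gap(date_lists, default_days=28):
    flat = sorted(d for sub in date_lists for d in sub)
    if len(flat) < 2:
        return default_days  # same fallback as the loop above
    return pd.Series(flat).diff().dt.days.mean()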
def sub_matrix(bcf_stats, header=_sub_header, report=None):
    """Create a report section with a base substitution matrix.

    :param bcf_stats: one or more outputs from `bcftools stats`.
    :param header: a markdown formatted header.
    :param report: an HTMLSection instance.

    :returns: an HTMLSection instance; if `report` was provided, the given
        instance is modified and returned.
    """
    report = _maybe_new_report(report)
    report.markdown(header)
    sim_sub = {
        'G>A': 'C>T', 'G>C': 'C>G', 'G>T': 'C>A',
        'T>A': 'A>T', 'T>C': 'A>G', 'T>G': 'A>C'}

    def canon_sub(sub):
        # canonicalize to a substitution whose reference base is A or C
        b1 = sub[0]
        if b1 not in {'A', 'C'}:
            return canon_sub(sim_sub[sub])
        else:
            return b1, sub[2]

    df = bcf_stats['ST']
    df['canon_sub'] = df['type'].apply(canon_sub)
    df['original'] = df['canon_sub'].apply(lambda x: x[0])
    df['substitution'] = df['canon_sub'].apply(lambda x: x[1])
    df['count'] = df['count'].astype(int)
    df = df[['original', 'substitution', 'count']] \
        .groupby(['original', 'substitution']) \
        .agg(count=pd.NamedAgg(column='count', aggfunc='sum')) \
        .reset_index()

    colors = Blues9[::-1]
    mapper = LinearColorMapper(palette=colors, low=min(df['count']), high=max(df['count']))
    p = figure(
        y_range=['C', 'A'], x_range=['A', 'C', 'G', 'T'],
        x_axis_location="above",
        x_axis_label='alternative base', y_axis_label='reference base',
        tools="save", toolbar_location='below', output_backend="webgl",
        height=225, width=300,
        tooltips=[('sub', '@original>@substitution'), ('count', '@count')])
    p.grid.grid_line_color = None
    p.axis.axis_line_color = None
    p.axis.major_tick_line_color = None
    p.rect(
        source=df, y="original", x="substitution", width=1, height=1,
        fill_color={'field': 'count', 'transform': mapper},
        line_color=None)
    report.plot(p)
    return report
def main_individualData(data_filename, headers_filename, output_filename, ascending=True, top_n=None):
    """Run main script

    Expected input headers are the following:
    indices: column with indices (discrete) values
    metrics: column with metric (continuous) values
    sort: columns to sort by
    scale: columns to project/transform to the range 0 to 1
    """
    # read in the headers
    data_used = pd.read_csv(headers_filename)
    index_columns = list(data_used.loc[:, "indices"].dropna())
    metric_columns = list(data_used.loc[:, "metrics"].dropna())
    sort_columns = list(data_used.loc[:, "sort"].dropna())
    scale_columns = list(data_used.loc[:, "scale"].dropna())
    headers = list(set(chain(index_columns, metric_columns, sort_columns)))

    # read in the data
    df = pd.read_csv(data_filename, usecols=headers)

    # make the empty data frame for the aggregate statistics
    metric_means = [item + "_mean" for item in metric_columns]
    metric_errors = [item + "_error" for item in metric_columns]
    metric_sort = [item + "_mean" for item in sort_columns]
    agg_stats = pd.DataFrame(columns=index_columns)

    # calculate the aggregated statistics (average and standard deviation)
    for metric in list(set(chain(metric_columns, sort_columns))):
        # scale the data
        if metric in scale_columns:
            df[metric] = df[metric].transform(lambda x: (x - x.min()) / (x.max() - x.min()))
        # aggregate statistics
        df_tmp = df.groupby(index_columns).agg(
            metric_means=pd.NamedAgg(column=metric, aggfunc="mean"),
            metric_errors=pd.NamedAgg(column=metric, aggfunc="std"))
        df_tmp = df_tmp.rename(columns={
            "metric_means": metric + "_mean",
            "metric_errors": metric + "_error"
        })
        agg_stats = agg_stats.merge(df_tmp, on=index_columns, how="outer")

    # sort and filter
    agg_stats = agg_stats.sort_values(
        by=metric_sort, axis=0, ascending=ascending).reset_index(drop=True)
    if top_n:
        agg_stats = agg_stats.head(top_n)

    # make the plot
    fig = dicrete_plot(agg_stats, index_columns, metric_means, metric_errors,
                       scale=1.5, legend_offset=0.5, capsize=5)

    # Export to svg file:
    fig.savefig(output_filename)
df.head()
df.info()
df['PercentKept'] = 1 - df.PercentOfQuantityReturned
df.head()

import datetime as dt

df['ReceivedMonth'] = df['ReceivedDate'].dt.month
df['ReceivedYear'] = df['ReceivedDate'].dt.year
grouped_monthly = df.groupby(['ReceivedYear', 'ReceivedMonth', 'Vendor']).agg(
    MonthlyNormDeliveryTime=pd.NamedAgg(column='NormDeliveryTime', aggfunc='mean'),
    MonthlyPercentReceived=pd.NamedAgg(column='PercentOfQuantityReceived', aggfunc='mean'),
    MonthlyPercentKept=pd.NamedAgg(column='PercentKept', aggfunc='mean'),
    VendorId=pd.NamedAgg(column='Vendor', aggfunc='first'))
grouped_monthly.head(60)

import numpy as np
from numpy import linalg as LA

def Rater(Alternatives):
    length = Alternatives.shape[0]
    matrix = np.ones((length, length))
    for i in range(length):
    'Balance': 'mean'
})
df_summary.rename(columns={
    'Exited': '# of churned customers',
    'Balance': 'Average Balance of Customers'
}, inplace=True)
df_summary

# Alternative
# The NamedAgg function allows renaming the columns in the aggregation. The syntax is as follows:
df_summary_1 = df[['Geography', 'Exited', 'Balance']].groupby('Geography').agg(
    Number_of_churned_customers=pd.NamedAgg('Exited', 'sum'),
    Average_balance_of_customers=pd.NamedAgg('Balance', 'mean'))

# 15. Reset the index
# The index of the dataframes that groupby returns consists of the group names. We can change it
# by resetting the index.
df_new = df[['Geography', 'Exited', 'Balance']].groupby(['Geography', 'Exited']).mean().reset_index()
df_new

# 16. Reset the index with a drop
# In some cases, we need to reset the index and get rid of the original index at the same time.
# Consider a case where we draw a sample from a dataframe. The sample will keep the index of the
# original dataframe, so we want to reset it.
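# A minimal sketch of that case (hypothetical sample size): drop=True discards
# the carried-over index instead of inserting it as a new column:
df_sample = df.sample(n=5).reset_index(drop=True)
df_sample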
def sanitize_blast_data(data: pd.DataFrame, queries: pd.DataFrame, targets: pd.DataFrame,
                        qmult=3, tmult=1):
    if data[data.btop.isna()].shape[0] > 0:
        raise ValueError(
            f"BTOP not present in the tabular file: please rerun BLAST with the correct blast_keys: {blast_keys}")
    data["qseqid"] = data["qseqid"].str.replace(id_pattern, "\\1")
    data["sseqid"] = data["sseqid"].str.replace(id_pattern, "\\1")
    assert "qid" in queries.columns or "qid" == queries.index.name, queries.head()
    assert "sid" in targets.columns or "sid" == targets.index.name, targets.head()
    data = data.join(queries, on=["qseqid"]).join(targets, on=["sseqid"]).join(
        data.groupby(["qseqid", "sseqid"]).agg(
            min_evalue=pd.NamedAgg("evalue", np.min),
            max_bitscore=pd.NamedAgg("bitscore", np.max))[["min_evalue", "max_bitscore"]],
        on=["qseqid", "sseqid"])

    # Check the joins were successful
    if any(data.qid.isna()):
        raise KeyError(
            "The tabular file passed in and the queries in the database differ. Please rerun BLAST and "
            "Mikado serialise with the correct query file.")
    if any(data.sid.isna()):
        print(targets.head())
        print()
        print(data["sseqid"].head())
        print()
        raise KeyError(
            "The tabular file passed in and the targets in the database differ. Please rerun BLAST and "
            "Mikado serialise with the correct target file.")

    for col in ["qstart", "qend", "sstart", "send", "qlength", "slength"]:
        if col != "slength":
            err_val = (col, data[data[col].isna()].shape[0], data.shape[0])
        else:
            err_val = (col, data[["sseqid"]].head())
        if data[col].isna().any():
            raise ValueError(
                "Column {col} contains {nnan} NaN values out of {tot}. Head: {err_val}. "
                "Please make sure you have run BLAST asking for the following fields: "
                "{blast_keys}".format(col=col,
                                      nnan=np.where(data[col].isna())[0].shape[0],
                                      tot=data.shape[0],
                                      err_val=err_val,
                                      blast_keys=blast_keys))
        try:
            data[col] = data[col].astype(int).values
        except ValueError as exc:
            raise ValueError("{}: {}".format(exc, col))

    for key, multiplier, (start, end), length in [
            ("query_frame", qmult, ("qstart", "qend"), "qlength"),
            ("target_frame", tmult, ("sstart", "send"), "slength")]:
        # Switch start and end when they are not in the correct order
        _ix = (data[start] > data[end])
        if multiplier > 1:
            data.loc[~_ix, key] = data[start] % multiplier
            data.loc[_ix, key] = -((data[length] - data[end] - 1) % multiplier)
            data.loc[(data[key] == 0) & ~_ix, key] = multiplier
            data.loc[(data[key] == 0) & _ix, key] = -multiplier
        else:
            data.loc[:, key] = 0
        data.loc[_ix, [start, end]] = data.loc[_ix, [end, start]].values
        data[start] -= 1

    # Get the minimum evalue for each group
    # data["aln_span"] = data.qend - data.qstart
    # Set the hsp_num
    data["sstart"] = data["sstart"].astype(int).values
    data["hsp_num"] = data.sort_values("bitscore", ascending=False).groupby(
        ["qseqid", "sseqid"]).cumcount() + 1
    temp = data[["qseqid", "sseqid", "max_bitscore"]].drop_duplicates().sort_values(
        ["max_bitscore", "sseqid"], ascending=[False, True])
    temp["hit_num"] = temp.groupby(["qseqid"]).cumcount() + 1
    temp.set_index(["qseqid", "sseqid"], inplace=True)
    data = data.join(temp["hit_num"], on=["qseqid", "sseqid"])
    data = data.sort_values(["qid", "sid"])
    data.set_index(["qid", "sid"], drop=False, inplace=True)
    return data
def colorscheme_by_site(
    colorscheme_name,
    sites_df,
    color_by,
):
    """Add a color scheme to the color registry.

    The scheme can then be added to a `nglview.widget.NGLWidget` as described
    `here <https://github.com/dwhswenson/contact_map/pull/62>`_. For instance::

        view.add_cartoon(color=colorscheme_name)

    Parameters
    ----------
    colorscheme_name : str
        Name of the color scheme.
    sites_df : pandas.DataFrame or str
        Information on how to color sites. Can either be a data frame or the
        name of a CSV file with a data frame. Must have columns named
        'pdb_chain' and 'pdb_site' as well as the column specified by
        `color_by`.
    color_by : str or 2-tuple
        How to color the sites. Can either specify as a str the name of a
        column in `sites_df` that has the name of a color for each site, or
        can be the 2-tuple `(val_col, color_map)`. In this case, `val_col` is
        the name of a column with numerical values, and `color_map` is a
        :class:`pdb_prot_align.colorschemes.ValueToColorMap` that maps the
        numbers in this column to colors. If colors are specified as str and
        are hex, then they need to be like this: '#25828e'.
    """
    site_col = 'pdb_site'
    chain_col = 'pdb_chain'

    if isinstance(sites_df, str):
        sites_df = pd.read_csv(sites_df)
    elif not isinstance(sites_df, pd.DataFrame):
        raise ValueError('`sites_df` must be data frame or name of CSV file')

    if isinstance(color_by, str):
        color_col = color_by
    elif (len(color_by) == 2 and isinstance(color_by[0], str) and
          isinstance(color_by[1], pdb_prot_align.colorschemes.ValueToColorMap)):
        val_col = color_by[0]
        if val_col not in sites_df.columns:
            raise ValueError(f"`sites_df` lacks column {val_col}")
        color_map = color_by[1]
        color_col = 'color'
        if color_col in sites_df.columns:
            raise ValueError(f"`sites_df` can not have column {color_col} "
                             'if `color_by` is a 2-tuple')
        sites_df = (sites_df.assign(
            color=lambda x: x[val_col].map(color_map.val_to_color)))

    cols = [site_col, chain_col, color_col]
    for col in cols:
        if col not in sites_df.columns:
            raise ValueError(f"`sites_df` lacks column {col}")

    # Drop duplicate and NaN rows, which could be the case if the data frame
    # is tidy and has amino-acid identity, and ensure site is integer.
    sites_df = (sites_df[cols].drop_duplicates().dropna().assign(
        **{site_col: lambda x: x[site_col].astype('int')}))

    # make sure just one color per site / chain
    dups = (sites_df.groupby([chain_col, site_col])
            .aggregate(nrows=pd.NamedAgg(color_col, 'count'))
            .query('nrows > 1')
            .reset_index()[[chain_col, site_col]])
    if len(dups):
        raise ValueError('non-unique colors for some sites:\n' + str(dups))

    # Do the coloring; details on selection schemes:
    # https://github.com/arose/ngl/blob/master/doc/usage/selection-language.md
    colorscheme = []
    for tup in sites_df.itertuples():
        chain = getattr(tup, chain_col)
        resi = getattr(tup, site_col)
        if isinstance(resi, float):
            if resi != int(resi):
                raise ValueError(f"non-integer residue {resi}")
            resi = int(resi)
        sel_str = f":{chain} and {resi}"
        color = getattr(tup, color_col)
        colorscheme.append([color, sel_str])
    nglview.color.ColormakerRegistry.add_scheme(colorscheme_name, colorscheme)
# sampling the data to the console
print(data.head())

# ***********************************************************************************************
# ***** LOGIC: EXTRACT A CSV HAVING STATE, COUNTY, YEAR, VOTES_BY_PARTY, PARTY, CANDIDATE & VOTES
# ***********************************************************************************************
print('***************************** STATE, COUNTY, YEAR, VOTES_BY_PARTY, PARTY, CANDIDATE & VOTES *****************************')

# creating a dataframe with candidatevotes aggregated across counties in states
data_cnty_sum = pd.DataFrame(
    data.groupby(['state', 'county', 'year', 'party', 'candidate']).agg(
        votes_by_party_cnty=pd.NamedAgg(column='candidatevotes', aggfunc=sum))).reset_index()

print('***************************** GENERATING UNIQUE STRING FOR DATA_CNTY_SUM *****************************')

# creating a new column called unique string; this unique string will be used to join with another dataframe
data_cnty_sum['Unique_string'] = data_cnty_sum['year'].map(str).str.strip() + \
    data_cnty_sum['state'].map(str).str.strip() + \
    data_cnty_sum['county'].map(str).str.strip() + \
    data_cnty_sum['votes_by_party_cnty'].map(str).str.strip()

print('***************************** GENERATING UNIQUE STRING FOR DATA_CNTY_SUM *****************************')
print(data_cnty_sum.head(1))

# creating dataframe with candidatevotes aggregated across counties to find out who the winner was in a given year
def data_exploration():
    st.title('Data exploration')
    st.write('''
        In this page, I develop the analyses I did before starting to implement a solution.
        I strongly believe this exploration could go a lot further with more time allocated
        to it. In particular, I did not explore friend interactions.
    ''')

    ########################################################################
    notifications = read_csv(csv_path)
    write_df('Initial dataset', notifications)

    ########################################################################
    nb_notifications = len(notifications)
    nb_users = len(notifications['user_id'].unique())
    nb_friend = len(notifications['friend_id'].unique())
    nb_users_days = len(notifications[['user_id', 'day']].drop_duplicates())
    duration = (notifications['timestamp'].max() - notifications['timestamp'].min()).days
    st.write('\n'.join([
        f'### Key numbers\n'
        f'- Sent **{nb_notifications} notifications**',
        f'- to **{nb_users} users**',
        f'- from **{nb_friend} friends**',
        f'- over **{duration} days**',
        # f'- {nb_users_days} user-day pairs',
    ]))

    st.write('''
        ### Dataset splitting
        The first thing I did with this dataset was to split it into a training and a test
        dataset. The idea is to see whether the hypotheses I formed on one part of the data
        still held on the other.
        In our case, a random sampling of the notifications is not effective since they can
        be highly correlated. What I did is:
        - split the users (receivers) into two groups
        - temporally split the notifications in half
        This gave me 4 datasets:
        - a training dataset, only on data from August
        - a test dataset only on data from August, in order to measure performance on a
          different subset of users
        - a test dataset only on data from September but with training users, in order to
          measure the difference between the month of August and the month of September
        - a test dataset with test users and September data, in order to combine both benefits
        To be honest, even though I split the dataset into those 4 parts, I did not have the
        time to really study the differences between them.
        Also, I realized that a better split was possible by distinguishing groups of friends.
        In my splitting, a friend could have sent a notification at the same time to two
        different users, one in the train set and one in the test set.
        (A sketch of this 4-way split is given after this function.)
    ''')
    st.info(f'Data loaded from {csv_path}')

    ########################################################################
    st.write('### Histogram of average nb of notifications per day per user')
    count_per_user_per_day, over_notified_users, fig = histograms(notifications)
    st.write(fig)
    st.write('''
        This figure shows the average number of notifications per day per user. What we can
        take away from it is that most users are below the threshold (4 notifications per
        day) in most cases. However, a non-negligible share of the users are regularly
        spammed.
    ''')
    write_df('Over notified users', over_notified_users)
    st.write(f'**Cumulated number of days with a user over-notified:** {len(over_notified_users)}')

    ########################################################################
    stats = compute_stats(notifications)
    st.write('### Repartition of users')
    st.write('''
        The figure below shows a repartition of the users based on the notifications they
        receive. Each point represents a group of users; the larger the dot, the more users
        there are.
        On the x-axis, the number of different days the user received a notification.
        On the y-axis, the maximum number of notifications the user received in a single day.
        In a way, all users below the threshold (red line) don't need any help from our
        bundler. A good predictor of whether the user belongs to this category would be a
        really good way to assure a minimal delay to most of our users. This option is
        briefly discussed in the DelayPredictor section.
    ''')
    fig = px.scatter(
        stats,
        x='nb_days_with_notifications',
        y='max_notifications_in_a_day',
        size='log_nb_users',
        color='critical',
        hover_data=['nb_users'],
    )
    fig.add_shape(
        go.layout.Shape(
            type="line",
            x0=0, y0=4.5,
            x1=max(stats['nb_days_with_notifications']), y1=4.5,
            line={'color': 'red', 'width': 3}
        ))
    fig.update_layout(yaxis_type="log")
    st.write(fig)

    ########################################################################
    st.write('### Per-user exploration')
    st.write('''
        Often in data-science projects, it is a good idea to look closely at the data, line
        by line. In the figure below, we can explore the notifications for each user one by
        one. Each point is a notification. A notification is labelled 'critical' when at
        least 4 notifications have been sent the same day for that user.
        On the x-axis, the timestamp of the notification in seconds of the day.
        On the y-axis, the day the notification was received.
    ''')
    user_ids = notifications['user_id'].unique().tolist()
    user_id = st.selectbox('User id', user_ids[:100])
    user_notifications = get_user_notifications(notifications, user_id)
    fig = px.scatter(
        user_notifications,
        x='second_in_day',
        y='day',
        color='critical',
        hover_data=['second_in_day'],
    )
    st.write(fig)

    ########################################################################
    st.write('## By-day repartition')
    st.write('''
        The last intuition I wanted to confirm was that the number of notifications per day
        varies a lot depending on the day. First, some days might be sunnier than others
        (so more people go for a tour). Second, some days of the week are more appropriate
        for a little hike.
        I also wanted to explore how early in the day we could predict that the day would be
        a busy one. On the figures we can see that around 10-12am, we already have a pretty
        good idea of what the total number of notifications will be that day. However, I did
        not have the time to implement a proper algorithm to take advantage of it.
    ''')
    for groupby_key in ['day', 'day_of_week']:
        st.write(f'### Count notifications grouped by {groupby_key}')
        kwargs = {
            'count': pd.NamedAgg('timestamp', 'count'),
            # 'day_of_week': pd.NamedAgg('day_of_week', lambda x: x.iloc[0]),
        }
        for hour in HOURS:
            kwargs[f'count_before_{hour}'] = pd.NamedAgg(f'is_before_{hour}', 'sum')
        counts_per_day = notifications.groupby(groupby_key).agg(**kwargs).reset_index()
        # write_df('counts_per_day', counts_per_day)
        fig = go.Figure()
        for hour in HOURS:
            hour_key = f'count_before_{hour}'
            fig.add_trace(
                go.Scatter(
                    x=counts_per_day[groupby_key],
                    y=counts_per_day[hour_key],
                    mode='lines',
                    name=hour_key,
                )
            )
        st.write(fig)
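# A minimal sketch of the 4-way split described above (not the original code;
# assumes 'timestamp' is a datetime64 column and September 1st as the temporal
# cutoff, both of which are assumptions):
import numpy as np
import pandas as pd

def four_way_split(notifications, cutoff='2019-09-01', seed=0):
    rng = np.random.default_rng(seed)
    users = notifications['user_id'].unique()
    test_users = set(rng.choice(users, size=len(users) // 2, replace=False))
    is_test_user = notifications['user_id'].isin(test_users)
    is_september = notifications['timestamp'] >= pd.Timestamp(cutoff)
    return {
        'train': notifications[~is_test_user & ~is_september],
        'test_users_august': notifications[is_test_user & ~is_september],
        'train_users_september': notifications[~is_test_user & is_september],
        'test_users_september': notifications[is_test_user & is_september],
    }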
def main(start_date, end_date, tipo_calcolo, path_anagrafica_pdr, path_anagrafica_pdr2,
         path_anagrafica_osservatori, path_wkr, path_output):
    # base path on S3
    path_to_data = 's3://zus-qa-s3/'

    # split the accounting period into sub-periods of at most one month;
    # START_COUNT and END_COUNT are the dates delimiting the intervals
    date_format = '%Y%m%d'
    start_d = datetime.strptime(start_date, date_format)
    end_d = datetime.strptime(end_date, date_format)
    N_NUM = end_d.month + (12 - start_d.month + 1) + (end_d.year - start_d.year - 1) * 12
    START_COUNT, END_COUNT = comp(start_d, end_d, N_NUM)
    year = start_date[:4]

    # read the processed profiles
    df_profili = read_profili(path_to_data + 'preprocessato/sistema/coefficienti/external/' + year +
                              '/profili_elaborati.csv', start_date, end_date)
    print('read from ' + path_to_data + 'preprocessato/sistema/coefficienti/external/' + year +
          '/profili_elaborati.csv')

    # read WKR
    df_wkr = read_wkr(start_date, end_date, tipo_calcolo, path_to_data + path_wkr)
    print('read from ' + path_to_data + path_wkr)

    # read PDR registry 1
    df_pdr = read_pdr(path_to_data + path_anagrafica_pdr)
    print('read from ' + path_to_data + path_anagrafica_pdr)

    # read PDR registry 2
    if path_anagrafica_pdr2 and (tipo_calcolo == 'cons'):
        df_pdr2 = read_pdr(path_to_data + path_anagrafica_pdr2)
        print('read from ' + path_to_data + path_anagrafica_pdr2)

    # read observers registry
    df_anagrafica_osservatori = read_osservatori(path_to_data + path_anagrafica_osservatori)
    print('read from ' + path_to_data + path_anagrafica_osservatori)

    columns = ['START_DATE', 'END_DATE', 'TIPO_CALCOLO', 'CONSUMO_ANNUO_ANOMALIE',
               'PATH_ANAGRAFICA_PDR', 'PATH_ANAGRAFICA_OSSERVATORI', 'PATH_WKR']
    df_metadata = pd.DataFrame([[start_date, end_date, tipo_calcolo, 0.0, path_anagrafica_pdr,
                                 path_anagrafica_osservatori, path_wkr]], columns=columns)

    for i, j in zip(START_COUNT, END_COUNT):
        print(i, j)
        df_coef_month = df_profili.loc[(df_profili['DATE'] >= i) & (df_profili['DATE'] <= j)]
        anno_mese = df_coef_month['ANNO_MESE'].unique()[0].replace("-", "").replace("/", "")
        print('coef filtered per month')
        df_pdr_month = df_pdr.loc[(df_pdr['DATA_FINE'] >= i) & (df_pdr['DATA_INIZIO'] <= j)]
        if df_pdr_month.empty:
            df_pdr_month = df_pdr2.loc[(df_pdr2['DATA_FINE'] >= i) & (df_pdr2['DATA_INIZIO'] <= j)]
            print('using anagrafica pdr2 for ', i, j)
        df_pdr_month_ee = df_pdr_month.loc[df_pdr_month['SOCIETA'] == 'edison_energia']
        df_pdr_month_sg = df_pdr_month.loc[df_pdr_month['SOCIETA'] == 'societa_gruppo']
        df_pdr_month_gr = df_pdr_month.loc[df_pdr_month['SOCIETA'] == 'grossisti']

        df_pp_pdr_aggr_month_ee, df_pp_pdr_aggr_station_tipo_tratt_month_ee, \
            df_pp_pdr_aggr_station_societa_profilo_tratt_month_ee, df_pp_pdr_checks_ee = mergeDati(
                df_coef_month, df_pdr_month_ee, df_anagrafica_osservatori, df_wkr, anno_mese, 'ee',
                path_to_data + path_output)
        print('computed edison energia')
        df_pp_pdr_aggr_month_sg, df_pp_pdr_aggr_station_tipo_tratt_month_sg, \
            df_pp_pdr_aggr_station_societa_profilo_tratt_month_sg, df_pp_pdr_checks_sg = mergeDati(
                df_coef_month, df_pdr_month_sg, df_anagrafica_osservatori, df_wkr, anno_mese, 'sg',
                path_to_data + path_output)
        print('computed societa gruppo')
        df_pp_pdr_aggr_month_gr, df_pp_pdr_aggr_station_tipo_tratt_month_gr, \
            df_pp_pdr_aggr_station_societa_profilo_tratt_month_gr, df_pp_pdr_checks_gr = mergeDati(
                df_coef_month, df_pdr_month_gr, df_anagrafica_osservatori, df_wkr, anno_mese, 'gr',
                path_to_data + path_output)
        print('computed grossisti')

        df_pp_pdr_aggr = df_pp_pdr_aggr_month_ee.append(df_pp_pdr_aggr_month_sg).append(
            df_pp_pdr_aggr_month_gr)
        df_pp_pdr_aggr_station_tipo_tratt = df_pp_pdr_aggr_station_tipo_tratt_month_ee.append(
            df_pp_pdr_aggr_station_tipo_tratt_month_sg).append(df_pp_pdr_aggr_station_tipo_tratt_month_gr)
        df_pp_pdr_aggr_station_societa_profilo_tratt = \
            df_pp_pdr_aggr_station_societa_profilo_tratt_month_ee.append(
                df_pp_pdr_aggr_station_societa_profilo_tratt_month_sg).append(
                df_pp_pdr_aggr_station_societa_profilo_tratt_month_gr)
        df_pp_pdr_checks = df_pp_pdr_checks_ee.append(df_pp_pdr_checks_sg).append(df_pp_pdr_checks_gr)
        print('computation ended')

        df_pp_pdr_aggr.to_csv(path_to_data + path_output + anno_mese + "/" + 'aggregato_societa_tipo_tratt.csv')
        print('aggregato_grafico written')
        df_pp_pdr_aggr_station_tipo_tratt.to_csv(
            path_to_data + path_output + anno_mese + "/" + 'aggregato_station_tipo_tratt.csv')
        print('aggregato_station_tipo_tratt written')
        df_pp_pdr_aggr_station_societa_profilo_tratt.to_csv(
            path_to_data + path_output + anno_mese + "/" + 'aggregato_station_societa_profilo_tratt.csv')
        print('aggregato_station_societa_profilo_tratt written')
        df_pp_pdr_checks.to_csv(path_to_data + path_output + anno_mese + "/" + 'anomalie_dettaglio.csv')

        df_pp_pdr_checks['TOT_CONSUMO_ANNUO'] = df_pp_pdr_checks.groupby(['PDR']).agg(
            TOT_CONSUMO_ANNUO=pd.NamedAgg(column='CONSUMO_ANNUO', aggfunc='mean')
        ).reset_index()['TOT_CONSUMO_ANNUO'].sum()
        df_pp_pdr_checks['TOT_PDR'] = len(df_pp_pdr_checks['PDR'].unique())
        # df_pp_pdr_kpi_checks = df_pp_pdr_checks[['TOT_PDR', 'TOT_CONSUMO_ANNUO']].drop_duplicates()
        df_metadata['CONSUMO_ANNUO_ANOMALIE'] = (df_metadata['CONSUMO_ANNUO_ANOMALIE'] +
                                                 df_pp_pdr_checks['TOT_CONSUMO_ANNUO'].sum())
        # df_pp_pdr_kpi_checks.to_csv(path_to_data + path_output + anno_mese + "/" + 'anomalie_aggregato.csv')
        print('dettaglio anomalie written')
        df_metadata.to_csv(path_to_data + path_output + anno_mese + "/" + 'metadati.csv')

    print('all months have been computed')
    return (path_to_data + path_output)
    columns=columns,
)
df_energy_hh.reset_index(inplace=True)
df_energy_hh.rename(columns={'index': 'interval'}, inplace=True)
df_energy_hh['interval'] = (df_energy_hh['interval'] % 48) + 1
df_energy_hh.set_index('interval_date', inplace=True)
for nc in numeric_columns:
    df_energy_hh[nc] = pd.to_numeric(df_energy_hh[nc])
# gross usage = metered consumption, plus self-consumed solar (solar generation
# net of export and battery charging), plus battery discharge
df_energy_hh['gross_usage_kwh'] = df_energy_hh['meter_consumption_kwh'] + \
    (df_energy_hh['solar_generation_kwh'] - df_energy_hh['meter_generation_kwh'] -
     df_energy_hh['charge_quantity_kwh']) + df_energy_hh['discharge_quantity_kwh']
df_energy_daily = df_energy_hh.groupby(['interval_date']).agg(
    meter_consumption_kwh=pd.NamedAgg(column='meter_consumption_kwh', aggfunc='mean'),
    meter_generation_kwh=pd.NamedAgg(column='meter_generation_kwh', aggfunc='mean'),
    solar_generation_kwh=pd.NamedAgg(column='solar_generation_kwh', aggfunc='mean'),
    solar_mean_powr_kw=pd.NamedAgg(column='solar_mean_powr_kw', aggfunc='mean'),
    solar_devices_reporting=pd.NamedAgg(column='solar_devices_reporting', aggfunc='median'),
    capacity_kw=pd.NamedAgg(column='capacity_kw', aggfunc='mean'),
    charge_quantity_kwh=pd.NamedAgg(column='charge_quantity_kwh', aggfunc='mean'),
    discharge_quantity_kwh=pd.NamedAgg(column='discharge_quantity_kwh', aggfunc='mean'),
    deterioration_state_pct=pd.NamedAgg(column='deterioration_state_pct', aggfunc='mean'),
    else:
        return pd.Series([i for i in range(0, len(df[x]))])


def linear_regression(df: pd.DataFrame, x: str, y: str) -> None:
    fixed_x = transform_variable(df, x)
    model = sm.OLS(df[y], sm.add_constant(fixed_x)).fit()
    print(model.summary())
    coef = pd.read_html(model.summary().tables[1].as_html(), header=0, index_col=0)[0]['coef']
    df.plot(x=x, y=y, kind='scatter')
    plt.plot(df[x], [pd.DataFrame.mean(df[y]) for _ in fixed_x.items()], color='green')
    plt.plot(df[x], [coef.values[1] * x + coef.values[0] for _, x in fixed_x.items()],
             color='red')
    plt.xticks(rotation=90)
    plt.savefig(f'img/lr_{y}_{x}.png')
    plt.close()


df = pd.read_csv("csv/typed_uanl.csv")  # type: pd.DataFrame
# print_tabulate(df.head(50))
df_by_sal = df.groupby("Fecha") \
    .aggregate(sueldo_mensual=pd.NamedAgg(column="Sueldo Neto", aggfunc=pd.DataFrame.mean))
# df_by_sal["sueldo_mensual"] = df_by_sal["sueldo_mensual"]**10
print_tabulate(df_by_sal.head(5))
linear_regression(df_by_sal, "Fecha", "sueldo_mensual")
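# Side note (a sketch, not part of the original script): statsmodels exposes
# the fitted coefficients directly via `model.params`, so the read_html
# round-trip over the summary table can be avoided:
def linear_regression_params(df, x, y):
    fixed_x = transform_variable(df, x)
    model = sm.OLS(df[y], sm.add_constant(fixed_x)).fit()
    intercept, slope = model.params  # 'const' comes first when add_constant was used
    return intercept, slope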
months_2017 = list(range(201701, 201713))
months_2018 = list(range(201801, 201813))
months_2019 = list(range(201901, 201913))
work_months = months_2017 + months_2018 + months_2019
work_months = list(map(str, work_months))

# list of customer IDs
customers = df.customer_id.unique().tolist()

# create a DF customer_id, months
mc = list(itertools.product(customers, work_months))
mc_df = pd.DataFrame(mc, columns=['customer_id', 'date'])

# aggregate the transactions
transactions = df.groupby(by=['customer_id', 'date']).agg(
    nb_transaction=pd.NamedAgg(column="product_id", aggfunc="count")).reset_index()

# full years data (months without any transaction are included)
full_transactions_df = transactions.merge(mc_df, on=['customer_id', 'date'], how='right').fillna(0)

# create the data set
base_date = parser.parse("2018-12-16T22:39:59.247Z")
data_set_list = []
for i in range(1, 13):
    # build the sliding period
    cursor_date = base_date + relativedelta(months=i - 1)
    year_before = cursor_date + relativedelta(months=-11)
    tree_months_after = cursor_date + relativedelta(months=3)
def modeling_n_prediction(df, device, position):
    # Filter table to the given device and positions less than or equal to the cutoff
    data = df[(df['device'] == device) & (df['position'] <= position)]
    data['rank'] = data['position'].astype(int)

    # Order the tables
    data = data.sort_values(["keyword", "date", "rank"], ascending=(True, True, True))

    # Group keywords, date and rank and calculate sum of clicks and impressions
    grouped_data = data.groupby(['keyword', 'date', 'rank']).agg(
        all_clicks=pd.NamedAgg(column='clicks', aggfunc=sum),
        all_impressions=pd.NamedAgg(column='impressions', aggfunc=sum))
    grouped_data = grouped_data.reset_index()
    grouped_data = grouped_data.sort_values(["keyword", "date", "rank"], ascending=(True, True, True))

    # Get the list of unique keywords in Google Search Console data
    mobile_keywords = grouped_data['keyword'].unique()
    grouped_data['keyword'] = grouped_data['keyword'].astype(str)

    key_date_df_list = {}
    count = 1
    for key in list(keyword_master['keywords'].unique()):
        key_df_list = {}
        print(count)
        print('Processing for keyword: ', key)
        print()
        distance = [dis.get_jaro_distance(key, word) for word in mobile_keywords]
        distance = np.array(distance)
        cluster = np.where(distance <= 0.3)
        total_count = len(mobile_keywords[cluster]) - 1
        words = '|'.join(mobile_keywords.tolist())
        key_df = pd.DataFrame(columns=['keyword', 'date', 'rank', 'clicks', 'impressions'])
        dt_list = list(grouped_data['date'].drop_duplicates().astype(str))
        dt_list.sort()
        apply1(dt_list, key_df_list, grouped_data, mobile_keywords, cluster, key)
        temp_df = pd.DataFrame(columns=['keyword', 'date', 'rank', 'clicks', 'impressions'])
        for k, val in key_df_list.items():
            temp_df = pd.concat([temp_df, val], ignore_index=True)
        key_date_df_list[key] = temp_df
        count = count + 1

    t_df = pd.DataFrame(columns=['keyword', 'date', 'rank', 'clicks', 'impressions'])
    for k, val in key_date_df_list.items():
        t_df = pd.concat([t_df, val], ignore_index=True)
    all_ranks_df = t_df

    if device == 'MOBILE':
        ctrs = ctr_df[['position', 'mobile_ctr']]
    else:
        ctrs = ctr_df[['position', 'web_ctr']]
    all_ranks_df['rank'] = all_ranks_df['rank'].astype(int)
    all_ranks_df['impressions'] = all_ranks_df['impressions'].astype(float)
    all_ranks_df = pd.merge(all_ranks_df, ctrs, left_on="rank", right_on="position")

    # Calculate the max and avg impressions for the keyword for each date
    temp_all_ranks_df = all_ranks_df.groupby(['keyword', 'date']).agg(
        avg_impressions=pd.NamedAgg(column='impressions', aggfunc=round_mean),
        max_impressions=pd.NamedAgg(column='impressions', aggfunc=round_max))
    temp_all_ranks_df = temp_all_ranks_df.reset_index()
    all_ranks_df = pd.merge(all_ranks_df, temp_all_ranks_df, on=['keyword', 'date'])

    # Replace NA values with avg impressions
    all_ranks_df['impressions'] = all_ranks_df['impressions'].fillna(all_ranks_df['avg_impressions'])
    all_ranks_df = all_ranks_df.sort_values(["keyword", "date", "rank"], ascending=(True, True, True))
    # df['First Season'] = np.where(df['First Season'] > 1990, 1, df['First Season'])
    all_ranks_df['impressions'] = np.where(
        all_ranks_df['impressions'] <= all_ranks_df['avg_impressions'],
        all_ranks_df['max_impressions'],
        all_ranks_df['impressions'])
    if device == 'MOBILE':
        all_ranks_df['clicks'] = (all_ranks_df['mobile_ctr'] * all_ranks_df['impressions']) / 100
    else:
        all_ranks_df['clicks'] = (all_ranks_df['web_ctr'] * all_ranks_df['impressions']) / 100
    all_ranks_df.clicks = all_ranks_df.clicks.round()
    all_ranks_df['clicks'] = all_ranks_df['clicks'].astype(int)
    if device == 'MOBILE':
        all_ranks_df['mobile_ctr'] = None
    else:
        all_ranks_df['web_ctr'] = None
    all_ranks_df['avg_impressions'] = None
    all_ranks_df['max_impressions'] = None
    all_ranks_df['keyword'] = all_ranks_df['keyword'].astype(str)
    all_ranks_df['impressions'] = all_ranks_df['impressions'].astype(int)
    all_ranks_df['date'] = all_ranks_df['date'].astype(str)

    casted_df = all_ranks_df.pivot_table(index=['keyword', 'date'], columns='rank',
                                         values=['clicks', 'impressions'])
    casted_df.columns = ["{0}_{1}".format(l1, l2) for l1, l2 in casted_df.columns]
    casted_df = casted_df.reset_index()
    casted_df['keyword'] = casted_df['keyword'].astype('category')

    key_pred_list = {}
    for key in list(keyword_master['keywords'].unique()):
        print('Forecasting for keyword - ', key)
        print()
        pred_pos_list = {}
        for position in range(1, 11):
            print('Position - ', position)
            print()
            key_sub = casted_df[casted_df['keyword'] == key]
            key_sub['date'] = pd.to_datetime(key_sub['date'])
            clicks_trend = key_sub[['clicks_' + str(position), 'date']]
            clicks_trend.columns = ["y", "ds"]
            prediction_days = 14
            pred_len = 0
            totalRow = len(clicks_trend)
            pred_range = [totalRow - pred_len + 1, totalRow]
            pre_views = clicks_trend.head(totalRow - pred_len)
            post_views = clicks_trend.tail(pred_len)
            m = fbprophet.Prophet()
            m.fit(pre_views)
            future = m.make_future_dataframe(periods=prediction_days)
            fcast = m.predict(future)
            pred_df = fcast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].tail(prediction_days)
            pred_df['position'] = position
            pred_df['keyword'] = key
            pred_df.columns = ["date", "clicks", "clicks_lower", "clicks_upper", "position", "keyword"]
            pred_df = pred_df[["keyword", "date", "position", "clicks", "clicks_lower", "clicks_upper"]]
            pred_df.clicks_upper = pred_df.clicks_upper.round()
            pred_df.clicks_lower = pred_df.clicks_lower.round()
            # fig1 = m.plot(fcast)
            pred_pos_list[position] = pred_df
        t1_df = pd.DataFrame(columns=["keyword", "date", "position", "clicks",
                                      "clicks_lower", "clicks_upper"])
        for k, val in pred_pos_list.items():
            t1_df = pd.concat([t1_df, val], ignore_index=True)
        key_pred_list[key] = t1_df
        print('\n')

    t2_df = pd.DataFrame(columns=["keyword", "date", "position", "clicks",
                                  "clicks_lower", "clicks_upper"])
    for k, val in key_pred_list.items():
        t2_df = pd.concat([t2_df, val], ignore_index=True)
    pred_key_df = t2_df

    casted_pred_df = pred_key_df.pivot_table(index=['keyword', 'date'], columns='position',
                                             values=['clicks', 'clicks_lower', 'clicks_upper'])
    casted_pred_df.columns = ["{0}_{1}".format(l1, l2) for l1, l2 in casted_pred_df.columns]
    casted_pred_df = casted_pred_df.reset_index()
    casted_pred_df = pd.merge(keywords_df, casted_pred_df, left_on="keywords", right_on="keyword")
    # casted_df['impressions'] = np.where(all_ranks_df['impressions'] <= all_ranks_df['avg_impressions'],
    #                                     all_ranks_df['max_impressions'], all_ranks_df['impressions'])
    # print(casted_pred_df['date'])
    casted_pred_df['date'] = casted_pred_df['date'].astype(str)
    casted_pred_df = casted_pred_df.astype(int, errors='ignore')
    # casted_pred_df['date'] = casted_pred_df['date'].astype(str)
    num = casted_pred_df._get_numeric_data()
    num[num < 0] = 0
    # print(casted_pred_df['date'])
    casted_pred_df.to_json(r'FinalResults_UK_' + device + '.json', orient='records')
    return list([casted_df, casted_pred_df])
#%%
import pandas as pd

#%%
data = pd.read_csv("data/projeto4_telecom_treino.csv")

#%%
data.columns

#%%
feature = "account_length"
churn_by_feature = data.groupby([feature, "churn"]).agg(
    count=pd.NamedAgg(column="churn", aggfunc="count"))
rows_present = [row[0] for row in churn_by_feature.iterrows()]
rows_absent = [(value, churn_)
               for value in data[feature].unique()
               for churn_ in ["no", "yes"]
               if (value, churn_) not in rows_present]
rows_absent = pd.DataFrame(
    rows_absent,
    columns=[feature, "churn"],
)
rows_absent["count"] = 0
churn_by_feature = churn_by_feature.reset_index()
churn_by_feature = churn_by_feature.append(rows_absent)
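#%%
# A minimal alternative sketch (same intent, not the original notebook's code):
# reindexing against the full (feature value, churn) product fills the missing
# combinations with zero counts in one step:
full_index = pd.MultiIndex.from_product([data[feature].unique(), ["no", "yes"]],
                                        names=[feature, "churn"])
churn_by_feature_alt = (data.groupby([feature, "churn"])
                        .agg(count=pd.NamedAgg(column="churn", aggfunc="count"))
                        .reindex(full_index, fill_value=0)
                        .reset_index())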
    'cum_confirmed_cases', 'cum_deaths', 'cum_recoveries'
]]

# for big cities, adjust adm level
mask = df['adm1_name'].isin(['Shanghai', 'Beijing', 'Tianjin', 'Chongqing'])
df.loc[mask, 'adm3_name'] = df.loc[mask, 'adm2_name'].tolist()
df.loc[mask, 'adm2_name'] = df.loc[mask, 'adm1_name'].tolist()

# drop cases unassigned to cities
df = df.loc[df['notes'] != 'prison', :]
df = df.loc[~df['adm2_name'].isin(
    ['International Imported Cases', 'Domestic Imported Cases', 'Unknown']), :]

# aggregate to city level
df = df.groupby(['adm1_name', 'adm2_name', 'date']).agg(
    cum_confirmed_cases=pd.NamedAgg(column='cum_confirmed_cases', aggfunc=np.nansum),
    cum_deaths=pd.NamedAgg(column='cum_deaths', aggfunc=np.nansum),
    cum_recoveries=pd.NamedAgg(column='cum_recoveries', aggfunc=np.nansum),
).reset_index()

# fill adm0_name variable
df.loc[:, 'adm0_name'] = 'CHN'

## Merge with pre 01/24 data, create balanced panel

# merge with pre 1/24 data
df = pd.concat([df, df_jan_merged], sort=False)

# create a balanced panel
adm = df.loc[:, ['adm0_name', 'adm1_name', 'adm2_name']].drop_duplicates()
days = pd.date_range(start='20200110', end=end_date)
import pandas as pd

app = dash.Dash(__name__)
server = app.server

df = pd.read_csv("netflix_titles.csv")
df.drop_duplicates(inplace=True)

pie_fig = px.pie(data_frame=df, names='type', hole=0.8, title='TV Show vs. Movie')
bar_fig = px.bar(
    data_frame=df.groupby(["type"], as_index=False).agg(
        count=pd.NamedAgg(column="type", aggfunc="count")),
    x='type', y='count', color='type', title='TV Show vs. Movie')

app.layout = html.Div(children=[
    html.H1(children='Visualizing Netflix Data With Python'),
    html.Div(children='''
        Using Pandas, Plotly Express, and Dash.
    '''),
    html.Div([
        dcc.Graph(id='graph1', figure=pie_fig),
    ]),
    html.Div([
        dcc.Graph(id='graph2', figure=bar_fig),
mtcarsDF.groupby('gear').size()
mtcarsDF.groupby(['gear', 'cyl']).size()
mtcarsDF.groupby(['gear', 'cyl']).count()  # size better
mtcarsDF.groupby('gear').mpg.agg('mean')
mtcarsDF.groupby('gear')['mpg'].agg('mean')
mtcarsDF.groupby('gear')['mpg', 'wt'].agg('mean')
mtcarsDF.groupby('gear')['mpg', 'wt'].agg(['mean', 'max'])
mtcarsDF.groupby('gear').agg([np.mean, np.sum])  # all columns; np is faster, numeric values
mtcarsDF.groupby('gear')['mpg', 'wt'].agg([np.mean, np.sum, 'count'])
mtcarsDF.groupby('gear')['mpg'].agg([np.mean, np.sum, 'count']).rename(columns={'mean': 'meanMPG'})
mtcarsDF.groupby('gear').agg(meanMPG=pd.NamedAgg(column='mpg', aggfunc='mean'))
mtcarsDF.groupby(['gear', 'am']).agg(maxMPG=pd.NamedAgg(column='mpg', aggfunc='max'))
mtcarsDF.groupby('gear').agg(meanMPG=pd.NamedAgg(column='mpg', aggfunc='mean'),
                             maxWT=pd.NamedAgg(column='wt', aggfunc='max'))
mtcarsDF['gear'].count()
mtcarsDF['gear'].max()
mtcarsDF.groupby('gear').mean()
mtcarsDF.groupby('gear').mean().add_prefix('MEAN_')
gearGp = mtcarsDF.groupby('gear')
gearGp.mean()
gearGp.nth(1)
gearGp.nth([1, 3])

# crosstab
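# A small sketch for the crosstab note above (a suggestion, not the original
# notes): pd.crosstab counts gear/cyl combinations, much like
# groupby([...]).size().unstack(), and can also aggregate a values column:
pd.crosstab(mtcarsDF['gear'], mtcarsDF['cyl'])
pd.crosstab(mtcarsDF['gear'], mtcarsDF['cyl'], values=mtcarsDF['mpg'], aggfunc='mean')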
def orderABC(folder_path, file_list, save_path, period, rate=None):
    if rate is None:
        rate = [0.8, 0.95, 1]
    class_type = ['A', 'B', 'C']
    df_list = []
    for i in range(len(file_list)):
        t = pd.read_csv('{}/{}'.format(folder_path, file_list[i]), encoding='gbk')
        df_list.append(t)
    df = pd.concat(df_list)
    df['CATE'] = 'FTW'
    df.loc[(df['CATEGORY'] != 'FTW'), ['CATE']] = 'APP'
    df['Period'] = period
    print('all data size: ', df.shape)

    sku = df.groupby('SKU').agg(line=pd.NamedAgg(column='DATEOUT', aggfunc='count'),
                                qty=pd.NamedAgg(column='QTY', aggfunc='sum')).reset_index()
    sku_cate = df[['SKU', 'Period', 'CATE']].drop_duplicates()
    sku = pd.merge(sku_cate, sku, on='SKU', how='left')
    sku_FTW = sku.loc[sku['CATE'] == 'FTW'].copy()
    sku_APP = sku.loc[sku['CATE'] == 'APP'].copy()

    save_file_name = 'skuABC_{}.xlsx'.format(period)
    write = pd.ExcelWriter('{}/{}'.format(save_path, save_file_name))
    skuABC = class_ABC(write, sku, rate, class_type)
    skuABC_FTW = class_ABC(write, sku_FTW, rate, class_type)
    skuABC_APP = class_ABC(write, sku_APP, rate, class_type)
    write.save()
    write.close()

    '''Order ABC combination'''
    ## order ABC combination
    skuABC_cate = skuABC_FTW[['SKU', 'lineABC', 'qtyABC']].append(
        skuABC_APP[['SKU', 'lineABC', 'qtyABC']])
    df = pd.merge(df, skuABC_cate, on='SKU', how='left')
    df_temp = df[['Period', 'DATEOUT', 'DOCNO', 'DATA_TYPE', 'CHANNEL']].drop_duplicates()

    ## count, per order, the number of SKUs in each line-count ABC class
    df_order_lineABC = pd.pivot_table(df, index=['DOCNO'], columns='lineABC', values='SKU',
                                      aggfunc='count', fill_value=0).reset_index()
    print(df_order_lineABC.columns)
    cols = list(df_order_lineABC.columns[1:])
    print(cols)
    x = np.where(df_order_lineABC[cols], cols, '')
    df_order_lineABC['orderLineABC'] = pd.Series(''.join(i) for i in x)

    ## count, per order, the number of SKUs in each quantity ABC class
    df_order_qtyABC = pd.pivot_table(df, index=['DOCNO'], columns='qtyABC', values='SKU',
                                     aggfunc='count', fill_value=0).reset_index()
    print(df_order_qtyABC.columns)
    cols = list(df_order_qtyABC.columns[1:])
    print(cols)
    y = np.where(df_order_qtyABC[cols], cols, '')
    df_order_qtyABC['orderQtyABC'] = pd.Series(''.join(i) for i in y)

    # print(df_order_lineABC.head(20))
    # print(df_order_qtyABC.head(20))
    df_order = pd.merge(df_order_lineABC[['DOCNO', 'orderLineABC']],
                        df_order_qtyABC[['DOCNO', 'orderQtyABC']],
                        on='DOCNO', how='left')
    df_result = pd.merge(df_temp, df_order, on='DOCNO', how='left')
    # print(df_result.head(20))
    save_file_name = 'orderABC_{}.csv'.format(period)
    df_result.to_csv('{}/{}'.format(save_path, save_file_name), encoding='gbk', index=False)
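# A tiny demonstration (made-up data) of the np.where trick used above: for
# each row, keep the column label wherever the count is non-zero, then join the
# labels into a per-order ABC signature such as 'AB' or 'C':
import numpy as np
import pandas as pd

counts = pd.DataFrame({'A': [2, 0], 'B': [1, 0], 'C': [0, 3]})
labels = np.where(counts, counts.columns, '')
signature = pd.Series(''.join(row) for row in labels)  # -> 'AB', 'C'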
def inverse_simpson_index(
    barcodecounts,
    *,
    barcodecol="barcode",
    countcol="count",
    groupcols="library",
):
    """Inverse Simpson index (reciprocal of the probability that two barcodes are the same).

    Parameters
    ----------
    barcodecounts: pandas.DataFrame
        Data frame with barcode counts
    barcodecol : str
        Column in ``barcodecounts`` listing all unique barcodes.
    countcol : str
        Column in ``barcodecounts`` with counts of each barcode.
    groupcols : str, list, or None
        Columns in ``barcodecounts`` by which we group for calculations.

    Returns
    -------
    pandas.DataFrame

    Example
    -------
    >>> barcodecounts = pd.DataFrame.from_records(
    ...     [('lib1', 'AA', 10),
    ...      ('lib1', 'AT', 20),
    ...      ('lib1', 'AC', 30),
    ...      ('lib2', 'AA', 5)],
    ...     columns=['library', 'barcode', 'count'])
    >>> inverse_simpson_index(barcodecounts)
      library  inverse_simpson_index
    0    lib1               2.571429
    1    lib2               1.000000

    """
    # based on here: https://gist.github.com/martinjc/f227b447791df8c90568
    reserved_cols = ["dummy", "p2", "simpson_index", "inverse_simpson_index"]
    for col in reserved_cols:
        if col in barcodecounts.columns:
            raise ValueError(f"`barcodecounts` cannot have column {col}")
    if groupcols:
        if isinstance(groupcols, str):
            groupcols = [groupcols]
    else:
        groupcols = ["dummy"]
        barcodecounts["dummy"] = "dummy"
    req_cols = [barcodecol, countcol, *groupcols]
    if not set(barcodecounts.columns).issuperset(req_cols):
        raise ValueError(f"`barcodecounts` lacks columns {req_cols}")
    if len(barcodecounts) != len(barcodecounts.groupby(req_cols)):
        raise ValueError("`barcodecol` and `groupcols` not unique rows")
    df = (barcodecounts
          .assign(p2=lambda x: (x[countcol] /
                                (x.groupby(groupcols)[countcol].transform("sum")))**2)
          .groupby(groupcols, as_index=False)
          .aggregate(simpson_index=pd.NamedAgg("p2", "sum"))
          .assign(inverse_simpson_index=lambda x: 1 / x["simpson_index"]))
    if groupcols == ["dummy"]:
        groupcols = []
    return df[[*groupcols, "inverse_simpson_index"]]
def order_by_date(folder_path, save_path, index=None):
    """Summarise each order's structure (SKU/quantity mix) and category, file by file."""
    if index is None:
        index = ['DATEOUT', 'DOCNO']
    save_file_name = 'order_by_' + '_'.join(index) + '.csv'
    os.chdir(folder_path)
    file_list = os.listdir()  # collect every file name in the folder
    for i in range(len(file_list)):
        df = pd.read_csv('{}/{}'.format(folder_path, file_list[i]), encoding='gbk',
                         low_memory=False)
        df['CATE'] = 'FTW'
        df.loc[df['CATEGORY'] != 'FTW', 'CATE'] = 'APP'

        # merge order DATA_TYPE, CHANNEL
        order_type = df[['DATEOUT', 'DOCNO', 'DATA_TYPE', 'CHANNEL']].drop_duplicates()
        skuNum = df.groupby(index)['SKU'].nunique().reset_index()
        qty = df.groupby(index).agg(
            Qty=pd.NamedAgg(column='QTY', aggfunc='sum')).reset_index()
        re0 = pd.merge(order_type, skuNum, on=index, how='outer')
        re = pd.merge(re0, qty, on=index, how='outer')

        # order structure labels: 单品单件 = single SKU / single unit,
        # 单品多件 = single SKU / multiple units, 多品单件 = multiple SKUs / one
        # unit each, 多品多件 = multiple SKUs / multiple units
        re['order_structure'] = np.nan
        re.loc[(re['SKU'] == 1) & (re['Qty'] == 1), 'order_structure'] = '单品单件'
        re.loc[(re['SKU'] == 1) & (re['Qty'] > 1), 'order_structure'] = '单品多件'
        re.loc[(re['SKU'] > 1) & (re['Qty'] > 1) & (re['SKU'] == re['Qty']),
               'order_structure'] = '多品单件'
        re.loc[(re['SKU'] > 1) & (re['Qty'] > 1) & (re['SKU'] != re['Qty']),
               'order_structure'] = '多品多件'
        # 单件 = single unit, 多件 = multiple units
        re['order_structure2'] = '单件'
        re.loc[re['order_structure'] != '单品单件', 'order_structure2'] = '多件'

        # order category: quantity per order split by FTW/APP
        order_cate = pd.pivot_table(df, index=index, columns=['CATE'], values=['QTY'],
                                    aggfunc='sum', fill_value=0)
        order_cate.columns = ['_'.join(c) for c in order_cate.columns]
        order_cate = order_cate.reset_index()
        order_cate['order_category'] = ''
        order_cate.loc[(order_cate['QTY_APP'] > 0) & (order_cate['QTY_FTW'] > 0),
                       'order_category'] = 'A+F'
        order_cate.loc[(order_cate['QTY_APP'] > 0) & (order_cate['QTY_FTW'] == 0),
                       'order_category'] = 'A'
        order_cate.loc[(order_cate['QTY_APP'] == 0) & (order_cate['QTY_FTW'] > 0),
                       'order_category'] = 'F'

        result = pd.merge(re, order_cate, on=index, how='outer')
        print(i + 1, file_list[i], 'orders:', df['DOCNO'].nunique(),
              'rows:', result.shape[0], 'units:', df['QTY'].sum())
        # write the first file with a header, then append the rest
        if i > 0:
            result.to_csv('{}/{}'.format(save_path, save_file_name), index=False,
                          encoding='gbk', header=False, mode='a')
        else:
            result.to_csv('{}/{}'.format(save_path, save_file_name), encoding='gbk',
                          index=False)
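# Hypothetical invocation (both paths are placeholders): scan every gbk CSV
# in the folder, classify each order's structure and FTW/APP category, and
# append all results into one combined CSV under save_path.
# order_by_date(folder_path='./raw_orders', save_path='./output')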
def main():
    # Pick a dataset. Note that load_boston was removed in scikit-learn 1.2;
    # swap in another regression dataset if running on a newer version.
    # data = load_wine()
    # data = load_breast_cancer()
    data = load_boston()
    # data = load_diabetes()
    X = data.data
    y = data.target

    def fit_and_plot(model_cls, column, feature_name, plot_func,
                     yaxis_title="y", **plot_kwargs):
        # Fit y ~ const + column, print the summary, and draw the plot
        # annotated with the slope's t-value and p-value.
        predictor = statsmodels.api.add_constant(column)
        fitted = model_cls(y, predictor).fit()
        print(f"Variable: {feature_name}")
        print(fitted.summary())
        t_value = round(fitted.tvalues[1], 6)
        p_value = "{:.6e}".format(fitted.pvalues[1])
        fig = plot_func(x=column, y=y, **plot_kwargs)
        fig.update_layout(
            title=f"Variable: {feature_name}: (t-value={t_value}) (p-value={p_value})",
            xaxis_title=f"Variable: {feature_name}",
            yaxis_title=yaxis_title,
        )
        fig.show()

    # Determine if response is continuous or boolean
    if y.dtype == object or y.dtype == bool:
        response_type = "boolean"
        print("---Response is boolean---")
    elif np.unique(y).size / y.size < 0.05:
        # few distinct values relative to the sample size: treat as boolean
        response_type = "boolean"
        print("---Response is boolean---")
    else:
        response_type = "continuous"
        print("---Response is continuous---")

    # Determine if each predictor is continuous or boolean and
    # create plots for each variable type
    predictor_type = []
    for idx, column in enumerate(X.T):
        feature_name = data.feature_names[idx]
        if column.dtype == object or column.dtype == bool:
            v_type = "boolean"
            print(feature_name, "is boolean")
            # categorical predictor: GLM (default Gaussian family), scatter plot
            fit_and_plot(statsmodels.api.GLM, column, feature_name, px.scatter)
        elif round(np.unique(column).size / column.size, 2) <= 0.05:
            v_type = "boolean"
            print(feature_name, "is boolean")
            if response_type == "continuous":
                # categorical predictor by continuous response: count histogram
                fit_and_plot(statsmodels.api.GLM, column, feature_name,
                             px.histogram, yaxis_title="Response",
                             histfunc="count")
            else:
                fit_and_plot(statsmodels.api.GLM, column, feature_name,
                             px.scatter)
        else:
            v_type = "continuous"
            print(feature_name, "is continuous")
            if response_type == "continuous":
                # continuous predictor by continuous response: OLS + scatter
                fit_and_plot(statsmodels.api.OLS, column, feature_name,
                             px.scatter)
            else:
                # continuous predictor by categorical response: histogram
                fit_and_plot(statsmodels.api.OLS, column, feature_name,
                             px.histogram)
        # record each variable's type for the ranking table below
        predictor_type.append(v_type)

    print("***Difference with mean table***")
    # Bin each predictor into n_of_bin equal-width bins and compare the mean
    # response per bin against the overall (population) response mean.
    n_of_bin = 10
    for idx, column in enumerate(X.T):
        feature_name = data.feature_names[idx]
        df = pd.DataFrame({feature_name: pd.Series(column)})
        df["target"] = data["target"]
        count_row = df.shape[0]

        # bin edges: start below the min so it is included and extend past
        # the max so it is included; rounded edges may collide, so duplicate
        # edges are dropped in pd.cut below
        p_min = df[feature_name].min()
        p_max = df[feature_name].max()
        bin_width = (p_max - p_min) / n_of_bin
        bin_list = [p_min - 1]
        s = p_min
        while s < p_max + 1:
            s += bin_width
            bin_list.append(round(s, 0))

        df_bin = df.copy()
        df_bin["LowerBin_UpperBin"] = pd.cut(
            x=df[feature_name], bins=bin_list, include_lowest=True,
            duplicates="drop",
        )
        df_bin["BinCenters"] = [b.mid for b in df_bin["LowerBin_UpperBin"]]
        df_bin["response"] = df["target"]

        # Group df_bin to create the difference-with-mean table; bin_mean is
        # the mean response within each bin (the "Bin Mean" trace below) and
        # bin_count is the bin's population
        df_bin_groupby = df_bin.groupby("LowerBin_UpperBin", as_index=False).agg(
            bin_mean=pd.NamedAgg(column="response", aggfunc="mean"),
            bin_count=pd.NamedAgg(column=feature_name, aggfunc="count"),
        )
        df_bin_groupby["BinCenter"] = [
            b.mid for b in df_bin_groupby["LowerBin_UpperBin"]]
        PopulationMean = df["target"].mean()
        df_bin_groupby["PopulationMean"] = PopulationMean
        df_bin_groupby["MeanSquaredDiff"] = (
            df_bin_groupby["bin_mean"] - df_bin_groupby["PopulationMean"]) ** 2
        # square the differences, sum them up and divide by the number of bins
        print(f"THE unWeighted NUMBER of {feature_name} IS : "
              f"{df_bin_groupby['MeanSquaredDiff'].sum() / n_of_bin}")
        print(feature_name, df_bin_groupby)

        trace1 = go.Bar(
            x=df_bin_groupby["BinCenter"],
            y=df_bin_groupby["bin_count"],
            name="population",
        )
        trace2 = go.Scatter(
            x=df_bin_groupby["BinCenter"],
            y=df_bin_groupby["PopulationMean"],
            name="population mean",
        )
        trace3 = go.Scatter(
            x=df_bin_groupby["BinCenter"],
            y=df_bin_groupby["bin_mean"],
            name="Bin Mean",
        )
        layout = go.Layout(title_text="Binned Response Mean vs Population Mean")
        fig = go.Figure(data=[trace1, trace2, trace3], layout=layout)
        fig.show()

        # Difference with mean table, weighted by each bin's population share
        print("***Difference with mean table (weighted)***")
        df_bin_groupby_weighted = df_bin_groupby.copy()
        df_bin_groupby_weighted["PopulationProportion"] = (
            df_bin_groupby_weighted["bin_count"] / count_row)
        df_bin_groupby_weighted["MeanSquaredDiffWeighted"] = (
            df_bin_groupby_weighted["MeanSquaredDiff"]
            * df_bin_groupby_weighted["PopulationProportion"])
        print(f"THE Weighted NUMBER of {feature_name} IS : "
              f"{df_bin_groupby_weighted['MeanSquaredDiffWeighted'].sum() / n_of_bin}")
        print(feature_name, df_bin_groupby_weighted)

    # Random Forest variable importance ranking
    print("***Random Forest Variable importance ranking***")
    model = RandomForestRegressor()
    model.fit(X, y)
    importance = model.feature_importances_
    df_ranking_vartype = pd.DataFrame({
        "Feature": pd.Series(data.feature_names),
        "Score": pd.Series([round(v, 5) for v in importance]),
    })
    df_ranking_vartype["Variable_type"] = predictor_type
    df_ranking_vartype_sort = df_ranking_vartype.sort_values(by=["Score"])
    # path that will save the ranking excel file
    path = r"D:\PycharmProjects\BDA696-MuTing\Feature_Importance_and_type.xlsx"
    df_ranking_vartype_sort.to_excel(path, index=False)
    print(df_ranking_vartype_sort)
def mergeDati(df_profili, df_pdr, df_anagrafica_osservatori, df_wkr, anno_mese,
              societa, path_output):
    print('computation for societa ' + societa + ' ' + anno_mese + ' started')
    df_pp_pdr = df_pdr.merge(df_anagrafica_osservatori, on='STATION', how='left')
    print('merged PDR with climate zone')
    df_pp_pdr = df_profili.merge(df_pp_pdr, on=['PROFILO'])
    print('merged PDR with profiles')
    # keep only rows whose date falls inside the PDR validity window
    df_pp_pdr = df_pp_pdr.loc[(df_pp_pdr['DATE'] >= df_pp_pdr['DATA_INIZIO'])
                              & (df_pp_pdr['DATE'] <= df_pp_pdr['DATA_FINE'])]
    print('filtered PDR by date')
    df_pp_pdr = df_pp_pdr.merge(df_wkr, on=['DATE', 'ZONA_CLIMATICA'], how='left')
    print('merged PDR with WKR')

    # daily coefficient K and estimated volume SMC, with and without the
    # weather correction factor WKR
    df_pp_pdr = df_pp_pdr.assign(
        K=df_pp_pdr['C_WKR'] * df_pp_pdr['WKR'] + df_pp_pdr['C_CONST'],
        K_NO_WKR=df_pp_pdr['C_WKR'] * 1 + df_pp_pdr['C_CONST'])
    df_pp_pdr = df_pp_pdr.assign(
        SMC=df_pp_pdr['K'] * df_pp_pdr['CONSUMO_ANNUO'] / 100,
        SMC_NO_WKR=df_pp_pdr['K_NO_WKR'] * df_pp_pdr['CONSUMO_ANNUO'] / 100)
    print('computed K and SMC')

    # shared aggregation spec for the three groupings below
    agg_spec = dict(
        SMC=pd.NamedAgg(column='SMC', aggfunc='sum'),
        SMC_NO_WKR=pd.NamedAgg(column='SMC_NO_WKR', aggfunc='sum'),
        K=pd.NamedAgg(column='K', aggfunc='sum'),
        K_NO_WKR=pd.NamedAgg(column='K_NO_WKR', aggfunc='sum'),
        CONSUMO_ANNUO=pd.NamedAgg(column='CONSUMO_ANNUO', aggfunc='sum'),
        C_CONST=pd.NamedAgg(column='C_CONST', aggfunc='sum'),
        C_WKR=pd.NamedAgg(column='C_WKR', aggfunc='sum'))

    df_pp_pdr_aggr_societa_tipo_tratt = df_pp_pdr.groupby(
        ['SOCIETA', 'TRATTAMENTO_AGG', 'TIPOLOGIA', 'DATE', 'ANNO_MESE',
         'WKR']).agg(**agg_spec).reset_index()
    print('computed aggregate for the chart')
    df_pp_pdr_aggr_station_tipo_tratt = df_pp_pdr.groupby(
        ['TRATTAMENTO', 'TIPOLOGIA', 'STATION', 'DATE', 'ANNO_MESE',
         'WKR']).agg(**agg_spec).reset_index()
    print('computed aggregate by station, tipologia, trattamento')
    df_pp_pdr_aggr_station_societa_profilo_tratt = df_pp_pdr.groupby(
        ['TRATTAMENTO', 'PROFILO', 'SOCIETA', 'PIVA', 'STATION', 'DATE',
         'ANNO_MESE', 'WKR']).agg(**agg_spec).reset_index()
    print('computed aggregate by station, societa, profilo, trattamento')

    # detail extract, written separately for trattamento Y and the rest
    df_pp_pdr_dett = df_pp_pdr[['SOCIETA', 'PIVA', 'TRATTAMENTO', 'TIPOLOGIA',
                                'PROFILO', 'ZONA_CLIMATICA', 'STATION', 'PDR',
                                'DATE', 'WKR', 'SMC', 'CONSUMO_ANNUO']]
    df_pp_pdr_dett.loc[df_pp_pdr_dett['TRATTAMENTO'] == 'Y'].to_csv(
        path_output + anno_mese + '/' + 'dettaglio/dettaglio_' + societa + '_y.csv')
    print('dettaglio y written: ' + str(
        df_pp_pdr_dett.loc[df_pp_pdr_dett['TRATTAMENTO'] == 'Y']['PDR'].count()))
    df_pp_pdr_dett.loc[df_pp_pdr_dett['TRATTAMENTO'] != 'Y'].to_csv(
        path_output + anno_mese + '/' + 'dettaglio/dettaglio_' + societa + '_gm.csv')
    print('dettaglio gm written: ' + str(
        df_pp_pdr_dett.loc[df_pp_pdr_dett['TRATTAMENTO'] != 'Y']['PDR'].count()))

    # sanity checks: rows with missing SMC or WKR
    df_pp_pdr_checks = df_pp_pdr_dett.loc[df_pp_pdr_dett['SMC'].isnull()
                                          | df_pp_pdr_dett['WKR'].isnull()]
    print('checks computed: smc null '
          + str(df_pp_pdr_dett.loc[df_pp_pdr_dett['SMC'].isnull()]['PDR'].count())
          + ' wkr null '
          + str(df_pp_pdr_dett.loc[df_pp_pdr_dett['WKR'].isnull()]['PDR'].count()))
    return (df_pp_pdr_aggr_societa_tipo_tratt, df_pp_pdr_aggr_station_tipo_tratt,
            df_pp_pdr_aggr_station_societa_profilo_tratt, df_pp_pdr_checks)
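# Hypothetical call (all file names are placeholders): mergeDati expects
# profile coefficients keyed by PROFILO, PDR master data with validity dates,
# a station registry mapping STATION to ZONA_CLIMATICA, and daily WKR factors
# keyed by DATE and ZONA_CLIMATICA.
# aggr_societa, aggr_station, aggr_profilo, checks = mergeDati(
#     df_profili=pd.read_csv('profili.csv'),
#     df_pdr=pd.read_csv('pdr.csv'),
#     df_anagrafica_osservatori=pd.read_csv('anagrafica_osservatori.csv'),
#     df_wkr=pd.read_csv('wkr.csv'),
#     anno_mese='202401', societa='ACME', path_output='./output/')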
(("y", "A"), np.min), (("y", "B"), "mean"), [1, 3], [0, 2], [5.5, 7.5], ), ( (("y", "A"), lambda x: max(x)), (("y", "A"), lambda x: 1), (("y", "B"), "mean"), [1, 3], [1, 1], [5.5, 7.5], ), ( pd.NamedAgg(("y", "A"), "max"), pd.NamedAgg(("y", "B"), np.mean), pd.NamedAgg(("y", "A"), lambda x: 1), [1, 3], [5.5, 7.5], [1, 1], ), ], ) def test_agg_relabel_multiindex_column(agg_col1, agg_col2, agg_col3, agg_result1, agg_result2, agg_result3): # GH 29422, add tests for multiindex column cases df = DataFrame({ "group": ["a", "a", "b", "b"], "A": [0, 1, 2, 3], "B": [5, 6, 7, 8]
def getGroupedByIMSI(self, df: pd.DataFrame):
    """Return a df with one row per IMSI and the remaining values aggregated."""
    def joinValues(series):
        # comma-join the unique non-null values observed for this IMSI
        return ','.join(map(str, series[series.notnull()].unique()))

    groupedDf = df.groupby('IMSI').agg(
        IMSI=pd.NamedAgg(column='IMSI', aggfunc=joinValues),
        RAT=pd.NamedAgg(column='RAT', aggfunc=joinValues),
        OPERATOR=pd.NamedAgg(column='OPERATOR', aggfunc=joinValues),
        CHANNEL=pd.NamedAgg(column='CHANNEL', aggfunc=joinValues),
        IMEI=pd.NamedAgg(column='IMEI', aggfunc=joinValues),
        TMSI=pd.NamedAgg(column='TMSI', aggfunc=joinValues),
        MS_POWER=pd.NamedAgg(column='MS_POWER', aggfunc=joinValues),
        TA=pd.NamedAgg(column='TA', aggfunc=joinValues),
        LAST_LAC=pd.NamedAgg(column='LAST_LAC', aggfunc=joinValues),
        HITS=pd.NamedAgg(column='HITS', aggfunc='size'),  # row count per IMSI
        DATE_TIME=pd.NamedAgg(column='DATE_TIME', aggfunc=joinValues),
    )
    # IMSI is kept as an aggregated column, so the groupby index can be dropped
    return groupedDf.reset_index(drop=True)
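# Toy demonstration (assumed columns and a hypothetical Grouper class that
# owns the method): two hits from the same IMSI collapse into a single row;
# joinValues keeps the unique non-null values and HITS counts grouped rows,
# so the expected result has one row with RAT == 'GSM,LTE' and HITS == 2.
# df = pd.DataFrame({
#     'IMSI': ['001', '001'], 'RAT': ['GSM', 'LTE'], 'OPERATOR': ['OpA', 'OpA'],
#     'CHANNEL': [1, 2], 'IMEI': ['X', None], 'TMSI': ['t1', 't2'],
#     'MS_POWER': [5, 5], 'TA': [0, 1], 'LAST_LAC': [100, 100],
#     'HITS': [1, 1], 'DATE_TIME': ['10:00', '10:05'],
# })
# grouped = Grouper().getGroupedByIMSI(df)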