def relative_change(self, original: pd.core.frame.DataFrame,
                    transformed: pd.core.frame.DataFrame):
    """
    Compute the relative change between the two given sets.

    Formula: |original - transformed| / f(original, transformed);
    with f(o, t) = 1/2 * (|o| + |t|)
    """
    return (original - transformed).abs() / (
        (original.abs() + transformed.abs()) / 2)
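# Usage sketch for relative_change (names below are illustrative, not from the
# source). Since it is a method, the same formula is inlined here directly:
# identical values give 0, an equal-magnitude sign flip gives the maximum of 2.
import pandas as pd

orig_df = pd.DataFrame({"a": [10.0, -4.0]})
trans_df = pd.DataFrame({"a": [8.0, 4.0]})
# |10 - 8| / ((10 + 8) / 2) = 2 / 9 ≈ 0.222;  |-4 - 4| / ((4 + 4) / 2) = 2.0
change = (orig_df - trans_df).abs() / ((orig_df.abs() + trans_df.abs()) / 2)
print(change)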
def save_output_table(data: pd.core.frame.DataFrame, file_name: str,
                      index: bool = True, path: str = '') -> None:
    """Save the given DataFrame in the Output folder as a csv file."""
    ensure_folder_existence(f'{path}/Output/Tables')
    data.to_csv(f'{path}/Output/Tables/{file_name}.csv', index=index)
def _clean_data(df: pd.core.frame.DataFrame) -> pd.core.frame.DataFrame:
    df = df.replace('N.A.', np.nan)
    df.columns = [inflection.underscore(column) for column in df.columns]
    # TODO: using a random date to make sqlite happy. Once xl data cleaned, remove this.
    df['date_of_birth'] = df['date_of_birth'].fillna('01.01.2017')
    df['date_of_birth'] = df['date_of_birth'].apply(_parse_date)
    df['image_path'] = df['image_path'].apply(_normalize_slashes)
    return df
def export_table_csv(table: pd.core.frame.DataFrame, path: str) -> None:
    """
    Export a pandas data frame to a csv file. Can be useful if you want to
    save a copy of the data locally.

    :param table: Table which we want to export.
    :param path: Path to which we want to save the file
        (e.g. r'../local-data/newspapers.csv').
    :return: Nothing
    """
    table.to_csv(path)
def insert_hash(df: pd.core.frame.DataFrame) -> pd.core.frame.DataFrame:
    """Prepend a 'hash' column holding a SHA-256 digest of each row."""
    hash_list = []
    # Note: iterrows() yields (index, Series) pairs, so the digest depends on
    # the row's index label as well as the Series string representation.
    for row in df.iterrows():
        hash_list.append(hashlib.sha256(str(row).encode()).hexdigest())
    df.insert(0, 'hash', hash_list)
    return df
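# Quick sketch for insert_hash (illustrative data; assumes the hashlib import
# used above): every digest is a 64-char hex string, and equal values on
# different index labels hash differently because the index is part of the
# hashed representation.
import pandas as pd

demo = insert_hash(pd.DataFrame({"x": [1, 2]}))
print(demo["hash"].str.len().unique())  # [64]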
def draw_graph(data_set: pd.core.frame.DataFrame):
    """
    Method for drawing the graph.

    :param data_set: data to plot
    """
    data_set.plot(x=X_NAME, y=Y_NAME, style="o")
    plt.xlabel(X_NAME)
    plt.ylabel(Y_NAME)
    plt.show()
def plot_single(single_run: pd.core.frame.DataFrame, metric: str,
                savepath: str) -> None:
    # NOTE: the `metric` argument is currently unused; the plotted columns
    # come from DEFAULT_TRAIN_METRIC and DEFAULT_VALID_METRIC.
    fig, ax = plt.subplots(1, 2, figsize=(8, 6), sharex=True, sharey='row',
                           gridspec_kw={'wspace': 0, 'hspace': 0})
    box = dict(facecolor='yellow', pad=6, alpha=0.2)
    ax[0].text(1.0, 1.0, 'BEST RUN', transform=ax[0].transAxes,
               horizontalalignment='center', verticalalignment='bottom',
               fontweight='bold')
    ax[0].text(0.5, 0.98, 'TRAINING', transform=ax[0].transAxes,
               horizontalalignment='center', verticalalignment='top', bbox=box)
    ax[1].text(0.5, 0.98, 'EVALUATION', transform=ax[1].transAxes,
               horizontalalignment='center', verticalalignment='top', bbox=box)
    train_name = DEFAULT_TRAIN_METRIC
    valid_name = DEFAULT_VALID_METRIC
    single_run.plot(x='epoch', y=train_name, ax=ax[0], legend=False)
    single_run.plot(x='epoch', y=valid_name, ax=ax[1], legend=False)
    ymin = np.min((np.min(single_run[train_name]),
                   np.min(single_run[valid_name]))) * 0.95
    ymax = np.max((np.percentile(single_run[train_name], 95),
                   np.percentile(single_run[valid_name], 95)))
    xmin = np.min(single_run['epoch']) - np.max(single_run['epoch']) * 0.01
    xmax = np.max(single_run['epoch']) * 1.01
    ax[0].set_xlim(xmin, xmax)
    ax[0].set_ylim(ymin, ymax)
    ax[0].yaxis.set_label_coords(-0.15, 0.5, transform=ax[0].transAxes)
    ax[0].set_ylabel('loss', bbox=box)
    fig.savefig(savepath, bbox_inches='tight', dpi=200, transparent=True)
def train_som(som_width: int, som_height: int, df: pd.core.frame.DataFrame,
              df_train: pd.core.frame.DataFrame,
              df_test: pd.core.frame.DataFrame, df_train_columns: list,
              n_iter: int, sigma=0.3, learning_rate=0.01):
    """
    Trains a self-organizing map and returns train and test datasets with
    predicted clusters.

    Arguments:
        som_width - width of the som map
        som_height - height of the som map
        df - initially prepared dataset
        df_train - training dataset
        df_test - testing dataset
        df_train_columns - list of columns of the training dataset
        n_iter - number of iterations during training
        sigma - sigma parameter for the model
        learning_rate - learning rate

    Returns:
        final_df_train - training dataset with predicted cluster
        final_df_test - testing dataset with predicted cluster
    """
    som = MiniSom(som_width, som_height, df_train.shape[1], sigma=sigma,
                  learning_rate=learning_rate, random_seed=0)
    som.train(df_train, n_iter)

    # converting numpy arrays to dataframes
    df_train = pd.DataFrame(df_train, columns=df_train_columns)
    df_test = pd.DataFrame(df_test, columns=df_train_columns)

    # creating a cluster column based on the model prediction
    df_train['cluster'] = df_train.apply(lambda x: som_predict(x, som), axis=1)
    df_test['cluster'] = df_test.apply(lambda x: som_predict(x, som), axis=1)

    # joining train and test dataframes with previously dropped columns, which
    # will be useful in the further part of the script. Note these are row
    # slices: the first len(df_train) rows belong to the training set, the
    # rest to the test set; join aligns on index labels.
    final_df_train = df_train.join(
        df[['Date', 'Price', 'close_plus_20_days',
            'profit']].iloc[:len(df_train)], lsuffix='_org')
    final_df_test = df_test.join(
        df[['Date', 'Price', 'close_plus_20_days',
            'profit']].iloc[len(df_train):], lsuffix='_org')
    return final_df_train, final_df_test
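# Minimal MiniSom sketch (assumes the minisom package used above; data and
# map size are illustrative) mirroring the training call inside train_som:
import numpy as np
from minisom import MiniSom

X = np.random.default_rng(0).random((100, 4))
som = MiniSom(5, 5, X.shape[1], sigma=0.3, learning_rate=0.01, random_seed=0)
som.train(X, 1000)
print(som.winner(X[0]))  # (row, col) of the best-matching unit for one sample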
def reduce_memory_usage(
        data: pd.core.frame.DataFrame) -> pd.core.frame.DataFrame:
    """
    Reduce a DataFrame's memory footprint by downcasting each numeric column
    to the smallest dtype that can hold its value range.

    Args:
        data (pd.core.frame.DataFrame): DataFrame to optimize in place.

    Returns:
        pd.core.frame.DataFrame: The same DataFrame with downcast dtypes.
    """
    start_memory = data.memory_usage().sum() / 1024**2
    print(
        "Memory usage before optimization is: {:.4f} MB".format(start_memory))
    numerics = [
        "int16",
        "int32",
        "int64",
        "float16",
        "float32",
        "float64",
    ]
    for col in data.columns:
        col_type = data[col].dtypes
        if col_type in numerics:
            col_min = data[col].min()
            col_max = data[col].max()
            if str(col_type)[:3] == "int":
                if col_min > np.iinfo(np.int8).min and col_max < np.iinfo(
                        np.int8).max:
                    data[col] = data[col].astype(np.int8)
                elif col_min > np.iinfo(np.int16).min and col_max < np.iinfo(
                        np.int16).max:
                    data[col] = data[col].astype(np.int16)
                elif col_min > np.iinfo(np.int32).min and col_max < np.iinfo(
                        np.int32).max:
                    data[col] = data[col].astype(np.int32)
                elif col_min > np.iinfo(np.int64).min and col_max < np.iinfo(
                        np.int64).max:
                    data[col] = data[col].astype(np.int64)
            else:
                if col_min > np.finfo(np.float16).min and col_max < np.finfo(
                        np.float16).max:
                    data[col] = data[col].astype(np.float16)
                elif col_min > np.finfo(np.float32).min and col_max < np.finfo(
                        np.float32).max:
                    data[col] = data[col].astype(np.float32)
                else:
                    data[col] = data[col].astype(np.float64)
    end_memory = data.memory_usage().sum() / 1024**2
    print("Memory usage after optimization is: {:.4f} MB".format(end_memory))
    print("Memory decreased by {:.1f}%".format(
        100 * (start_memory - end_memory) / start_memory))
    return data
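# Quick illustrative check for reduce_memory_usage (toy data): small-range
# int64/float64 columns should come back as int8/float16.
import numpy as np
import pandas as pd

toy = pd.DataFrame({"small_int": np.arange(100, dtype=np.int64),
                    "small_float": np.linspace(0.0, 1.0, 100)})
toy = reduce_memory_usage(toy)
print(toy.dtypes)  # expect small_int: int8, small_float: float16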
def pie_chart_margin(column: str, df: pandas.core.frame.DataFrame,
                     title1: str, title2: str, explode: tuple):
    """GroupBy Classification, visualization by margin."""
    df1 = pd.DataFrame(df.groupby('Classification')[column].sum())
    df3 = pd.DataFrame(
        df.groupby('Classification')['totalMonthlyNetSale'].sum())
    # df1 holds only the summed `column`, so index it generically rather than
    # hard-coding 'SellMargin'
    df4 = df1[column] / df3['totalMonthlyNetSale']
    print(df3.reset_index())
    print('\nSell Margin % means: TotalSellMargin / totalMonthlyNetSale '
          'within classification')
    labels = df1.index
    fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(15, 5))

    # plot 1: share of each classification in the summed column
    sizes1 = df1[column]
    axes[0].pie(sizes1, explode=explode, labels=labels, autopct='%1.1f%%',
                shadow=True, startangle=90)
    axes[0].axis('equal')  # Equal aspect ratio ensures the pie is a circle.
    axes[0].set_title(title1, fontsize=15)

    # plot 2: margin as a percentage of net sales per classification
    sns.barplot(x="Classification", y=df4.reset_index().columns[1],
                data=df4.reset_index(), ax=axes[1])
    axes[1].set_title(title2, fontsize=15)
    axes[1].set_ylabel(
        '% SellMargin / totalMonthlyNetSale of this Classification',
        fontsize=12)

    # plot 3: distribution of the column per classification
    axes[2] = sns.boxplot(x="Classification", y=column, data=df)
    axes[2] = sns.swarmplot(x="Classification", y=column, data=df, color=".25")
    axes[2].set_title('Sell Margin Details', fontsize=15)

    fig.tight_layout()
    plt.show()
def formatData(dataFrame: pandas.core.frame.DataFrame,
               keysToGet: List[str] = ['Open', 'High', 'Low', 'Last']):
    """Forward-fill gaps and convert the requested columns to plain lists,
    keyed by column name, with dates formatted as dd-mm-YYYY."""
    # fillna returns a new frame; assign the result so the fill takes effect
    dataFrame = dataFrame.fillna(method='ffill')
    formattedData = {'Date': dataFrame.index.strftime("%d-%m-%Y").tolist()}
    for key in keysToGet:
        formattedData.update({key: dataFrame[key].tolist()})
    return formattedData
def _create(self, data: pd.core.frame.DataFrame) -> None:
    data.drop(['SepsisLabel'], inplace=True, axis=1, errors='ignore')
    self.columns = data.columns
    patients = data['Id'].unique()
    # raw string avoids the invalid '\|' escape sequence in the spinner frames
    iterator = (tqdm(patients, desc="Creating Tensor", ascii=r' |/-\|/-\=')
                if self.verbose else patients)
    for patientId in iterator:
        patientData = data[data['Id'] == patientId].drop('Id', axis='columns')
        self.dataset.append(patientData.values)
def df_to_csv(df: pd.core.frame.DataFrame, outdir: str,
              outfile: str = "pydamage_results.csv"):
    """Write Pydamage results to disk

    Args:
        df (pandas DataFrame): Pydamage results DataFrame
        outdir (str): Path to output directory
        outfile (str): Output file name
    """
    df = df.round(3)
    if not outdir:
        outdir = "."
    df.to_csv(f"{outdir}/{outfile}")
def _output(self, output_file_name: str or bool, encoding: str,
            output_data: pd.core.frame.DataFrame):
    """
    The function groups the data frame by keys and prints the required
    values to the console or writes them to a file.

    Parameters
    ----------
    output_file_name : str or bool
        The name of the output file to open, or a bool operator
    encoding : str
        The encoding in which the files will be opened
    output_data : pd.core.frame.DataFrame
        Sorted DataFrame with the required data for output

    Returns
    -------
    None.
    """
    if output_file_name:
        with open(Path(output_file_name), encoding=encoding,
                  mode='w') as file:
            for label, g in output_data.groupby(['mm', 'dd', 'yyyy'],
                                                sort=False):
                # .iloc[0] takes the first row of the group regardless of
                # its index label
                file.write(str(g['ratio'].iloc[0]) + '\t')
                file.write(str(label[0]) + '\t')
                file.write(str(label[1]) + '\t')
                file.write(str(label[2]) + '\t')
                file.write(str(g['mean_maxt'].iloc[0]) + '\n')
                if g['kind'].iloc[0] == 1:
                    for i, j in g.sort_values(['wind', 'meteo']).iterrows():
                        file.write('\t' + str(j['avgt']) + ' ')
                        file.write(str(j['maxt']) + ' ')
                        file.write(str(j['mint']) + ' ')
                        file.write(str(j['wind']) + ' ')
                        file.write(str(j['hum']) + ' ')
                        file.write(str(j['meteo']) + '\n')
        print('OK')
    else:
        for label, g in output_data.groupby(['mm', 'dd', 'yyyy'], sort=False):
            print(g['ratio'].iloc[0], *label[0:3], g['mean_maxt'].iloc[0],
                  sep='\t')
            if g['kind'].iloc[0] == 1:
                for i, j in g.sort_values(['wind', 'meteo']).iterrows():
                    print('\t', j['avgt'], j['maxt'], j['mint'], j['wind'],
                          j['hum'], j['meteo'], sep=' ')
def clean_target(f: pd.core.frame.DataFrame) -> pd.core.frame.DataFrame:
    """Remove rows with target values that cannot be converted to float or nan."""
    # experimental value (y) is the last column
    y = f.iloc[:, -1]
    # indices that cannot be converted to float
    nonfloatable_indices_ = nonfloatable_indices(y.values)
    # drop bad indices
    f = f.drop(f.index[nonfloatable_indices_], axis=0)
    # remove nan
    return f.dropna(axis=0)
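# Behaviour sketch for clean_target (assumes the nonfloatable_indices helper
# defined elsewhere in this module flags non-parseable strings; data is
# illustrative): the 'abc' row goes because its target is not floatable, the
# None row because dropna removes it.
import pandas as pd

raw = pd.DataFrame({"feature": [1, 2, 3], "target": ["1.5", "abc", None]})
print(clean_target(raw))  # only the first row survives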
def to_csv(frame: pd.core.frame.DataFrame, *, name: str):
    save_name = '{}.csv'.format(name)
    if not os.path.exists(save_name):
        try:
            frame.to_csv(save_name, encoding='utf-8', index=False)
            print('{} saved successfully!'.format(name))
        except Exception:
            print('{} failed to save!'.format(name))
            traceback.print_exc()
    else:
        print('{} already exists'.format(name))
def add_hydrogen_to_source_data(data: pd.core.frame.DataFrame) -> str:
    """
    Takes the dataframe containing the smiles, adds the hydrogens back in,
    and saves it in the data folder under a new name. The new filename is
    the return value of the function.
    """
    def addHs(smile):
        m = Chem.MolFromSmiles(smile)
        m = Chem.AddHs(m)
        return Chem.MolToSmiles(m)

    data['SMILES'] = data['SMILES'].map(addHs)
    output_file = 'data/data_dups_removed_with_H.csv'
    data.to_csv(output_file)
    return output_file
def frequency_std_database(data: pd.core.frame.DataFrame):
    """Computes the sampling frequency:
    - Difference between two successive rows
    - Mean of the gap, of type datetime.timedelta:
      W days, X hours, Y minutes and Z seconds
    """
    data = data.diff()
    data = data.iloc[1:]
    mean_data = data.mean()
    std_data = data.std()
    return mean_data, std_data
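# Usage sketch for frequency_std_database (illustrative timestamps): diff()
# turns a datetime column into timedeltas, whose mean/std summarize how
# regular the sampling interval is.
import pandas as pd

stamps = pd.DataFrame({"ts": pd.to_datetime(
    ["2021-01-01 00:00", "2021-01-01 00:10", "2021-01-01 00:21"])})
mean_dt, std_dt = frequency_std_database(stamps)
print(mean_dt, std_dt, sep="\n")  # mean ≈ 10.5 minutes between samples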
def save(df: pd.core.frame.DataFrame, save_fp: str):
    """Saves the specified DataFrame to save_fp.

    Arguments:
        df {pd.core.frame.DataFrame} -- DataFrame to be saved
        save_fp {str} -- Path where the DataFrame is saved

    Returns:
        None -- Prints a success message"""
    df.to_csv(path_or_buf=save_fp, index=False, mode='w')
    print("File has been saved to '{}'".format(save_fp))
def toListOfTuple(self, df: pd.core.frame.DataFrame) -> List[Tuple]:
    """Convert demand data to a list of tuples
    [[timestamp, entitytag, revisionNo, rmse, mae, mape, rmse%], ...]

    Args:
        df (pd.core.frame.DataFrame): demand data dataframe

    Returns:
        List[Tuple]: list of tuples of revision-wise error data
    """
    # replacing entity_tag with constituent names
    replace_values = {
        "WRLDCMP.SCADA1.A0047000": "WR-Total",
        "WRLDCMP.SCADA1.A0046980": "Maharastra",
        "WRLDCMP.SCADA1.A0046957": "Gujarat",
        "WRLDCMP.SCADA1.A0046978": "Madhya Pradesh",
        "WRLDCMP.SCADA1.A0046945": "Chattisgarh",
        "WRLDCMP.SCADA1.A0046962": "Goa",
        "WRLDCMP.SCADA1.A0046948": "DD",
        "WRLDCMP.SCADA1.A0046953": "DNH",
    }
    df = df.replace({"ENTITY_TAG": replace_values})
    df['DATE_KEY'] = df['DATE_KEY'].astype('str')
    records = df.to_records(index=False)
    listOfTuple = list(records)
    return listOfTuple
def glycaemic_variability(
    df: pd.core.frame.DataFrame,
    colum_name: str = "Sensor Glucose (mg/dL)",
    windows: Dict[str, int] = {"weekly": 7, "monthly": 30},
    kind: str = "NAIVE"
) -> None:
    """Plot daily glycaemic variability together with its moving averages
    and the global mean. The NAIVE metric is the coefficient of variation,
    100 * std / mean, computed per calendar day."""
    methods = {
        "NAIVE": lambda x: 100 * pd.Series.std(x) / pd.Series.mean(x)
    }
    kind = kind.upper()
    if kind in methods:
        _statistic = df.groupby(df.index.date)[colum_name].apply(methods[kind])
        # Plot the main series:
        _statistic.plot(label="daily")
        # Plot the moving averages:
        for key, value in windows.items():
            _statistic.rolling(value).mean().plot(label=key)
        _global_mean = _statistic.mean()
        plt.axhline(_global_mean, label=f"mean = {round(_global_mean, 1)}",
                    c="blue")
        plt.legend()
        plt.title(f"Glycaemic Variability, assessment method: {kind}")
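# Usage sketch for glycaemic_variability (synthetic hourly CGM data; the
# window sizes here are illustrative, not from the source):
import numpy as np
import pandas as pd

idx = pd.date_range("2021-01-01", periods=24 * 14, freq="H")
cgm = pd.DataFrame(
    {"Sensor Glucose (mg/dL)":
     np.random.default_rng(0).normal(120, 20, len(idx))},
    index=idx)
glycaemic_variability(cgm, windows={"3-day": 3, "weekly": 7})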
def get_sent_score(data_frame: pd.core.frame.DataFrame,
                   col_list: list) -> dict:
    """Calculate the average sentiment score across every row of the given
    columns of the dataframe."""
    sent_analyzer = SentimentIntensityAnalyzer()
    neg_total = 0
    pos_total = 0
    neutral_total = 0
    total = 0
    for index, row in data_frame.iterrows():
        for col in col_list:
            score = sent_analyzer.polarity_scores(str(row[col]))
            neg_total += score['neg']
            pos_total += score['pos']
            neutral_total += score['neu']
            total += 1
    # return the average sentiment score of the data
    score_dict = {}
    score_dict["neg"] = neg_total / total
    score_dict["pos"] = pos_total / total
    score_dict["neutral"] = neutral_total / total
    print(score_dict)
    return score_dict
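# Usage sketch for get_sent_score (assumes the SentimentIntensityAnalyzer
# import used above, e.g. from vaderSentiment or nltk; data is illustrative):
import pandas as pd

reviews = pd.DataFrame({"text": ["Great product, works well!",
                                 "Terrible service, very slow."]})
scores = get_sent_score(reviews, ["text"])
# scores is {'neg': ..., 'pos': ..., 'neutral': ...}, averaged over all cells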
def deduplicate_column_values(data: pd.core.frame.DataFrame,
                              reserved_cols: List[str] = [],
                              max_obs: int = 65536) -> pd.core.frame.DataFrame:
    """Delete columns with the same values as a later column.

    Args:
        data: A DataFrame.
        reserved_cols: Names of columns to exclude from deduplication.
        max_obs: The number of observations to sample if data has more than
            that many observations.

    Returns:
        A DataFrame containing only the last instance of each unique column.
    """
    comparison_data = data.drop(reserved_cols,
                                axis=1).sample(n=min(max_obs, data.shape[0]),
                                               replace=False)
    deduplicated_cols = list(
        comparison_data.T.drop_duplicates(keep="last").index)
    deduplicated_data = data[reserved_cols + deduplicated_cols]
    duplicated_cols = [
        col for col in data if col not in deduplicated_data.columns
    ]
    if duplicated_cols:
        print(f'{", ".join(duplicated_cols)} dropped for having identical ' +
              "values as another feature")
    return deduplicated_data
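# Quick check for deduplicate_column_values (illustrative data): 'a' and 'b'
# hold identical values, and keep="last" retains the later column 'b'.
import pandas as pd

frame = pd.DataFrame({"id": [1, 2], "a": [3, 4], "b": [3, 4]})
print(list(deduplicate_column_values(frame, reserved_cols=["id"]).columns))
# expect ['id', 'b'] (and a message that 'a' was dropped)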
def mean_firing_rate_by(
    df: pd.core.frame.DataFrame,
    spiketimes_col: str = "spiketimes",
    spiketrain_col: str = "spiketrain",
    t_start: float = None,
    t_stop: float = None,
):
    """
    Estimate the mean firing rate of each spiketrain.

    Firing rate calculated by summing spikes and dividing by total time.

    Args:
        df: A pandas DataFrame containing spiketimes indexed by spiketrain
        spiketimes_col: The label of the column containing spiketimes
        spiketrain_col: The label of the column identifying the spiketrain
            responsible for the spike
        t_start: Time point at which to start. Defaults to time of first
            spike in df.
        t_stop: Maximum timepoint. Defaults to last spike in df.
    Returns:
        A DataFrame containing mean firing rate by neuron
    """
    if t_start is None:
        t_start = df[spiketimes_col].min()
    # explicit None check so a caller-supplied t_stop of 0 is not overwritten
    if t_stop is None:
        t_stop = df[spiketimes_col].max()
    return (df.groupby(spiketrain_col).apply(
        lambda x: spiketimes.statistics.mean_firing_rate(
            x[spiketimes_col].values,
            t_start=t_start,
            t_stop=t_stop,
        )).reset_index().rename(columns={0: "mean_firing_rate"}))
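# Usage sketch for mean_firing_rate_by (assumes the spiketimes package
# imported above; toy data): two trains over 1 s of recording, so the rates
# are simply the per-train spike counts.
import pandas as pd

spikes = pd.DataFrame({"spiketrain": [0, 0, 0, 1, 1],
                       "spiketimes": [0.1, 0.5, 0.9, 0.2, 0.8]})
print(mean_firing_rate_by(spikes, t_start=0.0, t_stop=1.0))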
def diffmeans_test_by(
    df: pd.core.frame.DataFrame,
    n_boot: int = 1000,
    spikecount_col: str = "spike_count",
    spiketrain_col: str = "spiketrain",
    condition_col: str = "cond",
):
    """
    Calculates the difference between means of spike counts for each
    spiketrain in a data frame and tests significance using a permutation
    test.

    Args:
        df: A pandas DataFrame containing spiketimes indexed by spiketrain
        n_boot: The number of permutation replicates to draw.
        spikecount_col: The label of the column containing spikecounts
        spiketrain_col: The label of the column identifying the spiketrain
            responsible for the spike
        condition_col: A categorical column containing 0 for the baseline
            condition and 1 for the experimental condition
    Returns:
        A pandas DataFrame containing one row per spiketrain with columns
        {'spiketrain', 'diff_of_means', 'p'}
    """
    return (df.groupby(spiketrain_col).apply(lambda x: pd.Series(
        spiketimes.statistics.diffmeans_test(
            x[spikecount_col].values,
            x[condition_col].values,
            n_boot=n_boot,
        ))).reset_index().rename(columns={0: "diff_of_means", 1: "p"}))
def auc_roc_test_by(
    df: pd.core.frame.DataFrame,
    n_boot: int = 1000,
    return_distance_from_chance: bool = False,
    spikecount_col: str = "spike_count",
    spiketrain_col: str = "spiketrain",
    condition_col: str = "cond",
):
    """
    Calculates the Area Under the Receiver Operating Characteristic Curve of
    spike counts for each spiketrain.

    The AUCROC can be used as a metric of the separability of two
    distributions. Each spiketrain must have been recorded in both conditions
    during multiple trials. Significance is tested using a permutation test.

    Args:
        df: A pandas DataFrame containing spiketimes indexed by spiketrain
        n_boot: The number of permutation replicates to draw.
        spikecount_col: The label of the column containing spikecounts
        spiketrain_col: The label of the column identifying the spiketrain
            responsible for the spike
        condition_col: A categorical column containing 0 for the baseline
            condition and 1 for the experimental condition
        return_distance_from_chance: If True, returns distance from 0.5
    Returns:
        A pandas DataFrame containing one row per spiketrain with columns
        {'spiketrain', 'AUCROC', 'p'}
    """
    return (df.groupby(spiketrain_col).apply(lambda x: pd.Series(
        spiketimes.statistics.auc_roc_test(
            x[spikecount_col].values,
            x[condition_col].values,
            n_boot=n_boot,
            return_distance_from_chance=return_distance_from_chance,
        ))).reset_index().rename(columns={0: "AUCROC", 1: "p"}))
def cat2onehot_list_loop(data: pd.core.frame.DataFrame,
                         varnames: List[str]) -> pd.core.frame.DataFrame:
    """One-hot encode each column in varnames, replacing it with indicator
    columns named after the raw category values."""
    data1 = data.copy()
    for col in varnames:
        df_oh = pd.get_dummies(data1[col])
        data1.drop(col, axis=1, inplace=True)
        data1 = pd.concat((data1, df_oh), axis=1)
    return data1
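# Usage sketch for cat2onehot_list_loop (illustrative data). Note that
# get_dummies is called without a prefix, so two encoded columns sharing a
# category value would produce clashing column names.
import pandas as pd

demo = pd.DataFrame({"color": ["red", "blue"], "n": [1, 2]})
print(cat2onehot_list_loop(demo, ["color"]).columns.tolist())
# expect ['n', 'blue', 'red']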
def matchPathToMetadata(metadata: pd.core.frame.DataFrame,
                        maps_folder_path: str, filesnames_column: str):
    '''
    Match the images found in path with the metadata.

    Input(s):
        metadata: dataframe containing metadata
        maps_folder_path: path where the maps images are found
        filesnames_column: column containing the image names, in the
            metadata dataframe
    Output(s):
        df_metadata: metadata dataframe containing the image paths, when
            they were found
    '''
    maps_paths = getImagesPaths(maps_folder_path)
    maps_names = []
    for path in maps_paths:
        maps_names.append(getImageName(path))
    maps_names = np.asarray(maps_names)

    _maps_paths = []
    # Series.items() replaces iteritems(), which was removed in pandas 2.0
    for ind, filename in metadata[filesnames_column].items():
        image_name = getImageName(filename)
        match = (image_name == maps_names).astype('int')
        if np.sum(match) == 1:
            _maps_paths.append(maps_paths[np.argmax(match)])
        else:
            _maps_paths.append(np.nan)

    metadata['path'] = _maps_paths
    df_metadata = metadata.dropna(subset=['path']).reset_index(drop=True)
    return df_metadata
def create_orientations_widget(self, orientations: pd.core.frame.DataFrame) \
        -> List[vtk.vtkInteractionWidgetsPython.vtkPlaneWidget]:
    """Create a plane widget for each orientation with interactive recompute
    of the model.

    Args:
        orientations (pd.core.frame.DataFrame):

    Returns:
        List[vtkInteractionWidgetsPython.vtkPlaneWidget]:
    """
    colors = self._get_color_lot(is_faults=True, is_basement=False)
    widget_list = []
    # for index, pt, nrm in zip(i, pts, nrms):
    self._color_lot = self._get_color_lot(is_faults=True, is_basement=False,
                                          index='id')
    for index, val in orientations.iterrows():
        widget = self.p.add_plane_widget(self.call_back_plane,
                                         normal=val[['G_x', 'G_y', 'G_z']],
                                         origin=val[['X', 'Y', 'Z']],
                                         bounds=self.extent,
                                         factor=0.15,
                                         implicit=False,
                                         pass_widget=True,
                                         test_callback=False,
                                         color=colors[val['surface']])
        widget.WIDGET_INDEX = index
        widget_list.append(widget)

    return widget_list
def shape_address_col(self, df_: pd.core.frame.DataFrame):
    '''Convert the address column into a full-address string.
    '''
    assert {'prefecture_id', 'ward_city_id'}.issubset(df_.columns)
    df = df_.copy()
    self.prefecture_city_id_info['ward_city_id'] = [
        int('%d%d' % (prefecture_id, ward_city_id))
        for prefecture_id, ward_city_id in zip(
            self.prefecture_city_id_info.prefecture_id.tolist(),
            self.prefecture_city_id_info.ward_city_id.tolist())
    ]
    self.prefecture_city_id_info.index = self.prefecture_city_id_info.prefecture_id
    prefecture_dic = self.prefecture_city_id_info.to_dict()['prefecture_name']
    self.prefecture_city_id_info.index = self.prefecture_city_id_info.ward_city_id
    ward_dic = self.prefecture_city_id_info.to_dict()['ward_city_name']
    df['ward_city_id'] = [
        int('%d%d' % (prefecture_id, ward_city_id))
        for prefecture_id, ward_city_id in zip(df.prefecture_id.tolist(),
                                               df.ward_city_id.tolist())
    ]
    df['prefecture_name'] = df['prefecture_id'].map(prefecture_dic)
    df['ward_city_name'] = df['ward_city_id'].map(ward_dic)
    df = df.dropna(subset=['prefecture_name', 'ward_city_name'])
    # 'address == address' is a NaN check (NaN != NaN); fall back to the
    # registered address when the address is missing or empty
    df['address'] = [
        prefecture_name + ward_city_name + address
        if address == address and address
        else prefecture_name + ward_city_name + registered_address
        for prefecture_name, ward_city_name, address, registered_address
        in zip(df.prefecture_name.tolist(), df.ward_city_name.tolist(),
               df.address.tolist(), df.registered_address.tolist())
    ]
    return df