def display_dict_as_table_vertical(input_dict: Dict) -> None:
    pipe(
        input_dict,
        dict_to_struct_table_vertical,
        render_struct_table,
        display_html,
    )
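# Hypothetical usage in a notebook, assuming the struct-table helpers above
# render a two-column key/value HTML table:
#
# >>> display_dict_as_table_vertical({'accuracy': 0.91, 'roc_auc': 0.88})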
def smallest_number_of_samples(found_datasets):
    common_mzs = common_range(found_datasets)
    count_mzs = partial(count_samples_in_common_range, common_mzs=common_mzs)
    estimate = pipe(
        for_each(pipe(get_mz_axis, count_mzs), lazy=False),
        report_value('mzs amounts'),
        np.min,
        report_value('minimal number of mzs'),
    )
    return estimate(found_datasets)
def plot_feature_importance(coefficients: DataFrame,
                            limit: Optional[int] = None) -> None:
    with pd.option_context('display.max_rows', None,
                           'display.max_columns', None):
        if not limit:
            limit = len(coefficients)
        # Order rows by absolute mean coefficient and keep the top `limit`.
        coefficients = coefficients.reindex(
            coefficients.abs().sort_values(ascending=True, by='mean').index)
        coefficients = coefficients[-limit:]

        plt.figure(figsize=(4, 7 * (limit / 25)))
        plt.tick_params(
            axis='y',      # changes apply to the y-axis
            which='both',  # both major and minor ticks are affected
            left=False,    # ticks along the left edge are off
        )
        rects = plt.barh(
            coefficients.index,
            coefficients['mean'],
            color="#f89f76",
        )
        max_width = pipe(
            rects,
            map(lambda rect: rect.get_width()),
            max,
        )
        # Annotate each bar with its mean coefficient.
        for index, rect in enumerate(rects):
            number = coefficients.iloc[index]['mean']
            plt.text(
                max_width * 1.1 + (-0.02 if number < 0 else 0),
                rect.get_y() + 0.2,
                f'{number:.3f}',
                ha='left',
            )
        plt.margins(y=0.01)

        ax = plt.gca()
        ax.spines['top'].set_visible(False)
        ax.spines['bottom'].set_visible(False)
        ax.spines['right'].set_visible(False)
        ax.spines['left'].set_linewidth(1)
        ax.spines['left'].set_color('#b9b8b9')
        ax.set_axisbelow(True)

        import matplotlib as mpl
        mpl.rcParams['figure.dpi'] = 100
        plt.grid(axis='x')
        ax.xaxis.grid(linestyle='--', which='major', linewidth=1)
        # Render the second major x gridline solid.
        ax.get_xgridlines()[1].set_linestyle('-')
def join_repeats_and_folds_cv_results(
        results: List[ModelCVResult]) -> ModelResult:
    return ModelResult(**pipe(
        results,
        join_repeats_cv_results,
        join_folds_cv_result,
    ))
def format_end_to_end_metrics_table(
    optimized_metrics_by_method: Dict[str, ClassificationMetricsWithStatistics],
    default_metrics_by_method: Dict[str, ClassificationMetricsWithStatistics],
    include_header: bool = True,
) -> List[List[str]]:
    methods = keys(optimized_metrics_by_method)

    def get_header() -> List[str]:
        return ['', 'Optimized ROC/AUC', 'Default ROC/AUC',
                'Optimized PR/AUC', 'Default PR/AUC']

    def get_line(_method: str) -> List:
        _optimized_metrics = optimized_metrics_by_method[_method]
        _default_metrics = default_metrics_by_method[_method]
        return [
            format_method(_method),
            _optimized_metrics['roc_auc'],
            _default_metrics['roc_auc'],
            _optimized_metrics['average_precision'],
            _default_metrics['average_precision'],
        ]

    return [
        *([get_header()] if include_header else []),
        *pipe(
            [get_line(method) for method in methods],
            partial(sorted, key=lambda i: i[1].mean, reverse=True),
            map(lambda line: [
                line[0],
                *[cell.format_short() for cell in line[1:]],
            ]),
        ),
    ]
def remove_indexes(iterable: Iterable, indexes: List[int]) -> Iterable:
    return pipe(
        iterable,
        add_index,
        filter(decorate_unpack(lambda i, _: i not in indexes)),
        map(get(1)),
        list,
    )
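# Hypothetical usage, assuming the curried `filter`/`map` and the
# `add_index`, `decorate_unpack`, `get` helpers come from the same
# functional library: keep only the elements whose position is not listed.
#
# >>> remove_indexes(['a', 'b', 'c', 'd'], [0, 2])
# ['b', 'd']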
def test_pipe():
    def valueadd1(v):
        return v + 1

    def valuemultiply2(v):
        return v * 2

    def listadd1(l):
        return functional.map(lambda e: e + 1, l)

    def listmultiply2(l):
        return functional.map(lambda e: e * 2, l)

    assert functional.pipe([valueadd1, valuemultiply2, valueadd1])(1) == 5
    assert functional.pipe([listadd1, listmultiply2, listadd1])(
        functional.range(1, 11)) == [5, 7, 9, 11, 13, 15, 17, 19, 21, 23]
def average_list_of_confusion_matrices(
        matrices: List[ConfusionMatrix]) -> ConfusionMatrixWithStatistics:
    return pipe(
        matrices,
        partial(map, object2dict),
        list,
        average_list_dicts,
        partial(
            valmap,
            lambda value: ValueWithStatistics(
                mean=value[0], std=value[1], ci=None),
        ),
        lambda matrix: ConfusionMatrixWithStatistics(**matrix),
    )
def compare_metrics_in_table(
    metrics_for_methods: Dict[str, ClassificationMetricsWithStatistics],
    include: Tuple[str, ...] = ('balanced_accuracy', 'roc_auc', 'recall', 'fpr'),
    format_method_name: Callable[[str], str] = identity,
    include_ci_for: Set[str] = None,
    include_delta: bool = False,
) -> List[List]:
    if include_ci_for is None:
        include_ci_for = include

    def get_line(
        method: str,
        metrics: Union[ClassificationMetrics,
                       ClassificationMetricsWithStatistics],
    ):
        return [
            format_method_name(method),
            *pipe(
                [
                    [
                        metrics[metric].mean,
                        (
                            metrics[metric].mean - get_max_metric_value(
                                metric, metrics_for_methods.values())
                        ) if include_delta else None,
                    ] + ([format_ci(metrics[metric].ci)]
                         if metric in include_ci_for else [])
                    for metric in include
                ],
                flatten,
                compact,
            ),
        ]

    lines = pipe(
        [get_line(method, metrics)
         for method, metrics in metrics_for_methods.items()],
        partial(sorted, key=get(1), reverse=True),
    )

    return format_structure(
        format_decimal,
        [
            [
                '',
                *flatten(
                    map(
                        lambda metric: [
                            format_metric_short(metric),
                            *(['Δ'] if include_delta else []),
                        ] + (['95% CI'] if metric in include_ci_for else []),
                        include,
                    )
                ),
            ],
            *lines,
        ],
    )
def run(self):
    gather_metadata = pipe(
        text_files,
        list,
        partial(LuigiTqdm, task=self),
        progress_bar('gathering metadata'),
        for_each(spectrum_metadata, lazy=False),
        np.vstack,
        partial(pd.DataFrame, columns=['R', 'X', 'Y']),
    )
    data_path = os.path.join(self.INPUT_DIR, self.dataset)
    metadata = gather_metadata(data_path)
    with self.output().open('w') as outfile:
        save_csv(metadata, outfile)
def structure_feature_importance(series: Series) -> List[FeatureImportanceItem]:
    return pipe(
        series,
        pandas.Series.items,
        partial(
            mapl,
            decorate_unpack(
                lambda feature, importance: {
                    'feature': feature,
                    'importance': importance,
                }
            ),
        ),
    )
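# Hypothetical usage, assuming `mapl` maps into a list and `decorate_unpack`
# spreads each (key, value) pair into the lambda's two arguments:
#
# >>> structure_feature_importance(pandas.Series({'age': 0.42, 'bmi': 0.11}))
# [{'feature': 'age', 'importance': 0.42},
#  {'feature': 'bmi', 'importance': 0.11}]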
def get_merged_roc_point(
    _roc_curves: List[Tuple[np.ndarray, np.ndarray, np.ndarray]],
    threshold: float,
) -> Tuple[float, float]:
    # Clamp thresholds above the valid range down to 1.
    if threshold > 1:
        threshold = 1
    merged_fpr, merged_tpr = pipe(
        _roc_curves,
        map(lambda curve: get_roc_point_by_threshold(threshold, *curve)),
        list,
        partial(np.mean, axis=0),
    )
    return merged_fpr, merged_tpr
def count_values_and_align(series1: Series,
                           series2: Series) -> Tuple[Series, Series]:
    values1 = series1.copy().value_counts().sort_index()
    values2 = series2.copy().value_counts().sort_index()
    indexes = pipe(
        set(values1.index).union(set(values2.index)),
        list,
        sorted,
    )
    # Insert a zero count for any value present in one series but not the other.
    for values in (values1, values2):
        for index in indexes:
            try:
                values.loc[index]
            except KeyError:
                values.loc[index] = 0
    return values1.sort_index(), values2.sort_index()
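# Hypothetical usage, with `Series` being `pandas.Series`: both returned
# series are indexed by the union of observed values, with zero counts
# filled in where a value never occurs.
#
# >>> a, b = count_values_and_align(Series([1, 1, 2]), Series([2, 3]))
# >>> list(a), list(b)
# ([2, 1, 0], [0, 1, 1])   # both indexed by [1, 2, 3]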
def stock_history(init_investment, monthly_contribution, stock_prices):
    # Parse each "p1 p2 ..." line into a list of integer prices.
    stock_prices = F.map(
        stock_prices,
        F.pipe(lambda x: x.split(" "), lambda x: [int(a) for a in x]))
    sell_prices = stock_prices[-1]
    total_earning = 0
    total_investment = 0
    max_earning_rate = [1 for _ in sell_prices]
    # Walk backwards from the second-to-last month, tracking the best
    # achievable ratio between the final sell price and each buy price.
    for month in range(len(stock_prices) - 2, -1, -1):
        current_prices = stock_prices[month]
        earning_rate = [sell_prices[i] / price
                        for i, price in enumerate(current_prices)]
        max_earning_rate = [max(a, b)
                            for a, b in zip(max_earning_rate, earning_rate)]
        money = init_investment if month == 0 else monthly_contribution
        total_earning += money * max(max_earning_rate)
        total_investment += money
    return int(round(total_earning - total_investment))
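# Hypothetical usage: two months of prices for two stocks. The initial
# investment is placed at month 0; the best ratio against the final prices
# is the first stock (10 -> 20), so 100 doubles and the profit is 100.
#
# >>> stock_history(100, 50, ["10 20", "20 10"])
# 100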
def get_cluster_mapping_by_prevalence(
    y_pred: Series,
    y_true: Series,
) -> Dict:
    def get_1_prevalence(distribution: Series) -> float:
        try:
            prevalence_1 = distribution[1] / distribution.sum()
        except KeyError:
            prevalence_1 = 0
        return prevalence_1

    return pipe(
        get_counts_per_cluster(y_pred, y_true),
        dict.items,
        partial(map_tuples,
                lambda index, distribution:
                (index, get_1_prevalence(distribution))),
        # `sorted(key=get(1))` alone would raise a TypeError; the sort step
        # must be partially applied like the other steps.
        partial(sorted, key=get(1)),
        enumerate,
        partial(map_tuples, lambda index, item: (item[0], index)),
        list,
        from_items,
    )
from functools import partial

from functional import (
    as_arguments_of,
    broadcast,
    pipe,
    report_value,
    take,
)
import luigi
import numpy as np

from components.io_utils import text_files, try_loadtxt
from components.spectrum.resampling import estimate_new_axis
from pipeline._base import *


def get_mzs_from_content(content: np.ndarray) -> np.ndarray:
    return content[:, 0]


get_mz_axis = pipe(
    text_files,
    partial(take, n_elements=1),
    as_arguments_of(try_loadtxt),
    get_mzs_from_content,
)

mz_range = pipe(
    report_value('dataset'),
    get_mz_axis,
    broadcast(np.min, np.max),
    np.array,
    report_value('mz range'),
)


def common_part(first, second):
    return max(first[0], second[0]), min(first[1], second[1])
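# Small illustration of `common_part`: the overlap of two (min, max) ranges.
#
# >>> common_part((100.0, 900.0), (250.0, 1200.0))
# (250.0, 900.0)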
def test_pipe(self):
    func_pipe = F.pipe(
        F.curryr(F.map)(lambda x: x + 1),
        F.curryr(F.map)(lambda x: x * 2))
    self.assertEqual(func_pipe(_list), [4, 6, 8, 10])
import os
from functools import partial

from functional import pipe
import numpy as np
import pandas as pd


def rooted_content(root: str):
    return [os.path.join(root, element) for element in os.listdir(root)]


subdirectories = pipe(rooted_content, partial(filter, os.path.isdir))
files = pipe(rooted_content, partial(filter, os.path.isfile))


def has_extension(path, extension: str = '.txt'):
    return os.path.splitext(path)[1].lower() == extension


is_text = partial(has_extension, extension='.txt')
text_files = pipe(files, partial(filter, is_text))


def try_loadtxt(fname: str):
    try:
        return np.loadtxt(fname)
    except ValueError:
        # Fall back to space-delimited files that use ',' as the decimal mark.
        return pd.read_csv(fname, delimiter=' ', header=None,
                           decimal=',').values
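# Hypothetical usage (the path is made up): `text_files` composes the
# directory listing with the extension filter, so calling it yields the
# *.txt paths under a directory.
#
# >>> paths = list(text_files('/data/raw'))
# >>> spectrum = try_loadtxt(paths[0])  # falls back to ',' decimals on failure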
def format_style(style: Dict) -> str:
    return pipe(
        (f'{key}: {value}' for key, value in style.items()),
        ";".join,
    )
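# Example: the generator of 'key: value' pairs is joined into an inline
# CSS string.
#
# >>> format_style({'color': 'red', 'font-size': '12px'})
# 'color: red;font-size: 12px'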
def get_datasets(
    base: DataFrame,
    impute: bool = True,
) -> Dict[str, DataFrame]:
    label = 'ACV2'
    data = pipe(
        base,
        format_columns,
        partial(
            select_features,
            features=(
                'RWT', 'EM', 'LVMI', 'IVSD', 'PP', 'GS', 'LA_Adi', 'SBP',
                'AM', 'LVPWD', 'MVE_VEL', 'LVIDD', 'LA_GS', 'RMVEA', 'PR',
                'SM', 'LA_Asi', 'REEM', 'REAM', 'IVRT', 'LAEDVi', 'LAESVi',
                'MVA_VEL', 'AO_DIAM', 'EF_MOD', 'LA_A_4ch', 'MV_DECT',
                'ESV_MODI', 'LA_EF_4ch', 'SV_MODI',
            ),
        ),
        impute_missing if impute else identity,
    )
    data = DataFrame(
        StandardScaler().fit_transform(data),
        columns=data.columns,
        index=data.index,
    )
    return dict(
        base=base,
        clustering=data,
        clustering_correlated_removed=select_features(
            data,
            (
                'RWT', 'EM', 'LVMI', 'GS', 'SBP', 'AM', 'LVPWD', 'MVE_VEL',
                'LVIDD', 'LA_GS', 'RMVEA', 'PR', 'SM', 'REEM', 'IVRT',
                'LAESVi', 'MVA_VEL', 'AO_DIAM', 'EF_MOD', 'MV_DECT',
                'ESV_MODI', 'LA_EF_4ch', 'SV_MODI',
            ),
        ),
        varsellcm=select_features(
            data,
            (
                'RWT', 'EM', 'LVMI', 'GS', 'SBP', 'AM', 'LVPWD', 'MVE_VEL',
                'LVIDD', 'LA_GS', 'RMVEA', 'SM', 'REEM', 'IVRT', 'LAESVi',
                'MVA_VEL', 'AO_DIAM', 'EF_MOD', 'MV_DECT', 'ESV_MODI',
                'LA_EF_4ch', 'SV_MODI',
            ),
        ),
        manual=select_features(
            # The original referenced an undefined `data_all`; `data` is
            # assumed to be the intended frame.
            data,
            (
                'SBP', 'PP', 'LVMI', 'PR', 'REEM', 'ESV_MODI', 'LAESVI',
                'LA_GS', 'MVE_VEL', 'MVA_VEL', 'RMVEA', 'AM', 'EM', 'GS',
                'MV_DECT',
            ),
        ),
        feature_selection_subsets={
            'manual': [
                'SBP', 'PP', 'LVMI', 'PR', 'REEM', 'ESV_MODI', 'LAESVI',
                'LA_GS', 'MVE_VEL', 'MVA_VEL', 'RMVEA', 'AM', 'EM', 'GS',
                'MV_DECT',
            ],
            'normalized_cut_15': [
                'PR', 'IVRT', 'LVIDD', 'PP', 'AO_DIAM', 'IVSD', 'LVMI',
                'ESV_MODI', 'SV_MODI', 'RWT', 'AM', 'REAM', 'SBP', 'LVPWD',
                'SM',
            ],
        },
        varsellcm_importance=[
            ['Variables', 'Discrim. Power', 'Discrim. Power (%)',
             'Discrim. Power (% cum)'],
            ['REAM', 817.64, 9.31, 9.31],
            ['LAEDVI', 699.19, 7.96, 17.27],
            ['EM', 629.51, 7.17, 24.44],
            ['LA_ADI', 596.19, 6.79, 31.23],
            ['RMVEA', 582.88, 6.64, 37.87],
            ['REEM', 481.04, 5.48, 43.34],
            ['LAESVI', 414.47, 4.72, 48.06],
            ['LVMI', 345.93, 3.94, 52.00],
            ['LA_ASI', 342.89, 3.90, 55.91],
            ['MVA_VEL', 335.27, 3.82, 59.72],
            ['IVSD', 309.54, 3.52, 63.25],
            ['LA_GS', 309.44, 3.52, 66.77],
            ['SBP', 295.45, 3.36, 70.14],
            ['LA_A_4CH', 284.82, 3.24, 73.38],
            ['LA_EF_4CH', 273.40, 3.11, 76.49],
            ['PP', 262.97, 2.99, 79.49],
            ['MV_DECT', 255.25, 2.91, 82.39],
            ['AM', 253.88, 2.89, 85.28],
            ['LVPWD', 244.71, 2.79, 88.07],
            ['RWT', 233.04, 2.65, 90.72],
            ['AO_DIAM', 162.76, 1.85, 92.58],
            ['MVE_VEL', 156.93, 1.79, 94.36],
            ['IVRT', 142.54, 1.62, 95.99],
            ['SM', 136.16, 1.55, 97.54],
            ['ESV_MODI', 114.66, 1.31, 98.84],
            ['LVIDD', 39.61, 0.45, 99.29],
            ['SV_MODI', 28.07, 0.32, 99.61],
            ['PR', 15.19, 0.17, 99.79],
            ['GS', 13.58, 0.15, 99.94],
            ['EF_MOD', 5.15, 0.06, 100.00],
        ],
        varsellcm_importance_k_2=[
            ['Variables', 'Discrim. Power', 'Discrim. Power (%)',
             'Discrim. Power (% cum)'],
            ['REAM', 643.22, 9.79, 9.79],
            ['EM', 557.63, 8.49, 18.28],
            ['RMVEA', 428.47, 6.52, 24.81],
            ['LAEDVI', 396.74, 6.04, 30.85],
            ['REEM', 385.05, 5.86, 36.71],
            ['LA_ADI', 337.76, 5.14, 41.85],
            ['LA_GS', 293.81, 4.47, 46.33],
            ['IVSD', 282.91, 4.31, 50.63],
            ['MVA_VEL', 265.06, 4.04, 54.67],
            ['SBP', 251.84, 3.83, 58.50],
            ['MV_DECT', 251.50, 3.83, 62.33],
            ['LVMI', 249.36, 3.80, 66.13],
            ['LVPWD', 238.89, 3.64, 69.77],
            ['RWT', 232.57, 3.54, 73.31],
            ['PP', 231.99, 3.53, 76.84],
            ['LAESVI', 209.58, 3.19, 80.03],
            ['LA_A_4CH', 201.95, 3.07, 83.11],
            ['LA_EF_4CH', 185.32, 2.82, 85.93],
            ['AO_DIAM', 183.30, 2.79, 88.72],
            ['LA_ASI', 161.47, 2.46, 91.18],
            ['AM', 149.00, 2.27, 93.45],
            ['IVRT', 129.09, 1.97, 95.41],
            ['MVE_VEL', 128.61, 1.96, 97.37],
            ['SM', 116.14, 1.77, 99.14],
            ['ESV_MODI', 21.25, 0.32, 99.46],
            ['GS', 15.08, 0.23, 99.69],
            ['EF_MOD', 12.04, 0.18, 99.87],
            ['LVIDD', 8.23, 0.13, 100.00],
        ],
        label=label,
        y_true=base[label],
    )
import sys
from functools import partial

from functional import (
    as_arguments_of,
    for_each,
    pipe,
    progress_bar,
)
import numpy as np

from components.io_utils import text_files, try_loadtxt


def spectrum_sampling_pipe(mzs):
    return pipe(
        try_loadtxt,
        np.transpose,
        as_arguments_of(partial(np.interp, mzs)),
        np.ravel,
        partial(np.ndarray.astype, dtype=np.float32),
    )


if __name__ == '__main__':
    DATASET = sys.argv[1]
    MZ_AXIS = sys.argv[2]
    DESTINATION = sys.argv[3]
    new_axis = np.loadtxt(MZ_AXIS, delimiter=',')
    resampled = pipe(
        text_files,
        list,
        progress_bar('resampling dataset'),
        for_each(spectrum_sampling_pipe(new_axis),
                 parallel=True, chunksize=800),
    )(DATASET)
    with open(DESTINATION, 'wb') as outfile:
        np.save(outfile, resampled)
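# Hypothetical invocation (the script and file names are made up):
#
#   python resample.py /data/raw/dataset1 mz_axis.csv resampled.npy
#
# i.e. sys.argv supplies the dataset directory, the target m/z axis CSV,
# and the output .npy path, in that order.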
import os
from functools import partial

from functional import pipe

only_directories = partial(filter, os.path.isdir)


def rooted_listdir(root):
    return [os.path.join(root, path) for path in os.listdir(root)]


subdirectories = pipe(rooted_listdir, only_directories, list)
def get_component(filename, component):
    match = metadata_pattern.search(filename)
    if match is None:
        raise ValueError(filename)
    metadata = filename[match.start() + 1:match.end() - 1]
    metadata = metadata[metadata.find(component) + 1:]
    metadata = metadata[:component_length(metadata)]
    return int(metadata)


spectrum_metadata = pipe(
    broadcast(
        partial(get_component, component='R'),
        partial(get_component, component='X'),
        partial(get_component, component='Y'),
    ),
    np.array,
)


class AssembleMetadata(BaseTask):
    INPUT_DIR = os.path.join(BaseTask.INPUT_DIR, 'raw')
    OUTPUT_DIR = os.path.join(BaseTask.OUTPUT_DIR, 'metadata')

    dataset = luigi.Parameter(description='Dataset to get metadata from')

    def output(self):
        return self._as_target("{0}.csv".format(self.dataset))

    def run(self):
        # Body assumed to match the standalone run() snippet earlier in
        # this collection, which gathers per-spectrum (R, X, Y) metadata.
        gather_metadata = pipe(
            text_files,
            list,
            partial(LuigiTqdm, task=self),
            progress_bar('gathering metadata'),
            for_each(spectrum_metadata, lazy=False),
            np.vstack,
            partial(pd.DataFrame, columns=['R', 'X', 'Y']),
        )
        data_path = os.path.join(self.INPUT_DIR, self.dataset)
        metadata = gather_metadata(data_path)
        with self.output().open('w') as outfile:
            save_csv(metadata, outfile)