Example #1
def display_dict_as_table_vertical(input_dict: Dict) -> None:
    pipe(
        input_dict,
        dict_to_struct_table_vertical,
        render_struct_table,
        display_html,
    )
Example #2
def smallest_number_of_samples(found_datasets):
    common_mzs = common_range(found_datasets)
    count_mzs = partial(count_samples_in_common_range, common_mzs=common_mzs)
    estimate = pipe(
        for_each(pipe(get_mz_axis, count_mzs), lazy=False),
        report_value('mzs amounts'),
        np.min,
        report_value('minimal number of mzs')
    )
    return estimate(found_datasets)
Example #3
def plot_feature_importance(coefficients: DataFrame,
                            limit: Optional[int] = None) -> None:
    with pd.option_context('display.max_rows', None, 'display.max_columns',
                           None):
        if not limit:
            limit = len(coefficients)

        coefficients = coefficients.reindex(coefficients.abs().sort_values(
            ascending=True, by='mean').index)
        coefficients = coefficients[-limit:]

    plt.figure(figsize=(4, 7 * (limit / 25)))

    plt.tick_params(
        axis='y',  # changes apply to the y-axis
        which='both',  # both major and minor ticks are affected
        left=False,  # ticks along the left edge are off
    )
    # plt.tick_params(axis='x', labelcolor='#414141', color='#b9b8b9')

    rects = plt.barh(
        coefficients.index,
        coefficients['mean'],
        color="#f89f76",
    )

    max_width = pipe(
        rects,
        map(lambda rect: rect.get_width()),
        max,
    )

    for index, rect in enumerate(rects):
        number = coefficients.iloc[index]['mean']
        plt.text(
            max_width * 1.1 + (-0.02 if number < 0 else 0),
            rect.get_y() + 0.2,
            f'{number:.3f}',
            # color='#060606',
            ha='left',
        )
    # plt.gcf().patch.set_facecolor('#fdeadd')
    plt.margins(y=0.01)
    # plt.gca().patch.set_facecolor('white')
    plt.gca().spines['top'].set_visible(False)
    plt.gca().spines['bottom'].set_visible(False)
    plt.gca().spines['right'].set_visible(False)
    plt.gca().spines['left'].set_linewidth(1)
    plt.gca().spines['left'].set_color('#b9b8b9')
    plt.gca().set_axisbelow(True)

    import matplotlib as mpl
    mpl.rcParams['figure.dpi'] = 100

    plt.grid(axis='x')

    plt.gca().xaxis.grid(linestyle='--', which='major', linewidth=1)
    plt.gca().get_xgridlines()[1].set_linestyle('-')
Example #4
def join_repeats_and_folds_cv_results(
        results: List[ModelCVResult]) -> ModelResult:
    return ModelResult(**pipe(
        results,
        join_repeats_cv_results,
        join_folds_cv_result,
    ))
Example #5
def format_end_to_end_metrics_table(
    optimized_metrics_by_method: Dict[str, ClassificationMetricsWithStatistics],
    default_metrics_by_method: Dict[str, ClassificationMetricsWithStatistics],
    include_header: bool = True,
) -> List[List[str]]:
    methods = keys(optimized_metrics_by_method)

    def get_header() -> List[str]:
        return ['', 'Optimized ROC/AUC', 'Default ROC/AUC', 'Optimized PR/AUC', 'Default PR/AUC']

    def get_line(_method: str) -> List:
        _optimized_metrics = optimized_metrics_by_method[_method]
        _default_metrics = default_metrics_by_method[_method]
        return [
            format_method(_method),
            _optimized_metrics['roc_auc'],
            _default_metrics['roc_auc'],
            _optimized_metrics['average_precision'],
            _default_metrics['average_precision'],
        ]

    return [
        *([get_header()] if include_header else []),
        *pipe(
            [get_line(method) for method in methods],
            partial(sorted, key=lambda i: i[1].mean, reverse=True),
            map(lambda line: [
                line[0],
                *[cell.format_short() for cell in line[1:]],
            ]),
        ),
    ]
Example #6
def remove_indexes(iterable: Iterable, indexes: List[int]) -> Iterable:
    return pipe(
        iterable,
        add_index,
        filter(decorate_unpack(lambda i, _: i not in indexes)),
        map(get(1)),
        list,
    )
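For readers unfamiliar with the snippet's curried helpers (add_index, decorate_unpack, get), a plain-Python sketch of the same logic is below; it assumes only that add_index pairs each element with its position, which enumerate reproduces.

def remove_indexes_plain(iterable, indexes):
    # enumerate plays the role of add_index; the comprehension replaces
    # the filter(...) and map(get(1)) stages of the pipe
    return [item for i, item in enumerate(iterable) if i not in indexes]

assert remove_indexes_plain('abcde', [1, 3]) == ['a', 'c', 'e']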
Example #7
def test_pipe():
    def valueadd1(v):
        return v + 1

    def valuemultiply2(v):
        return v * 2

    def listadd1(l):
        return functional.map(lambda e: e + 1, l)

    def listmultiply2(l):
        return functional.map(lambda e: e * 2, l)

    assert functional.pipe([valueadd1, valuemultiply2, valueadd1])(1) == 5
    assert functional.pipe([
        listadd1,
        listmultiply2,
        listadd1,
    ])(functional.range(1, 11)) == [5, 7, 9, 11, 13, 15, 17, 19, 21, 23]
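Note that this test exercises a list-accepting form of pipe: the composition is built first and applied later, unlike the data-first calls in most other examples. A minimal sketch of such a pipe, assuming left-to-right application, is:

from functools import reduce

def pipe_list(functions):
    # compose left to right: pipe_list([f, g])(x) == g(f(x))
    return lambda value: reduce(lambda acc, fn: fn(acc), functions, value)

assert pipe_list([lambda v: v + 1, lambda v: v * 2, lambda v: v + 1])(1) == 5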
Example #8
def average_list_of_confusion_matrices(
        matrices: List[ConfusionMatrix]) -> ConfusionMatrixWithStatistics:
    return pipe(
        matrices,
        partial(map, object2dict),
        list,
        average_list_dicts,
        partial(
            valmap, lambda value: ValueWithStatistics(
                mean=value[0], std=value[1], ci=None)),
        lambda matrix: ConfusionMatrixWithStatistics(**matrix),
    )
Example #9
def compare_metrics_in_table(
    metrics_for_methods: Dict[str, ClassificationMetricsWithStatistics],
    include: Tuple[str, ...] = ('balanced_accuracy', 'roc_auc', 'recall', 'fpr'),
    format_method_name: Callable[[str], str] = identity,
    include_ci_for: Optional[Set[str]] = None,
    include_delta: bool = False,
) -> List[List]:

    if include_ci_for is None:
        include_ci_for = include

    def get_line(
        method: str, metrics: Union[ClassificationMetrics, ClassificationMetricsWithStatistics]
    ):
        return [
            format_method_name(method),
            *pipe(
                [
                    [
                        metrics[metric].mean,
                        (
                            metrics[metric].mean -
                            get_max_metric_value(metric, metrics_for_methods.values())
                        ) if include_delta else None,
                    ] + ([format_ci(metrics[metric].ci)] if metric in include_ci_for else [])
                    for metric in include
                ],
                flatten,
                compact,
            ),
        ]

    lines = pipe(
        [get_line(method, metrics) for method, metrics in metrics_for_methods.items()],
        partial(sorted, key=get(1), reverse=True),
    )

    return format_structure(
        format_decimal,
        [
            [
                '', *flatten(
                    map(
                        lambda metric:
                        [format_metric_short(metric), *(['Δ'] if include_delta else [])] +
                        (['95% CI'] if metric in include_ci_for else []), include
                    )
                )
            ],
            *lines,
        ],
    )
Example #10
def run(self):
    gather_metadata = pipe(
        text_files, list,
        partial(LuigiTqdm, task=self),
        progress_bar('gathering metadata'),
        for_each(spectrum_metadata, lazy=False),
        np.vstack,
        partial(pd.DataFrame, columns=['R', 'X', 'Y'])
    )
    data_path = os.path.join(self.INPUT_DIR, self.dataset)
    metadata = gather_metadata(data_path)
    with self.output().open('w') as outfile:
        save_csv(metadata, outfile)
Example #11
def structure_feature_importance(series: Series) -> List[FeatureImportanceItem]:
    return pipe(
        series,
        pandas.Series.items,
        partial(
            mapl,
            decorate_unpack(
                lambda feature, importance: {
                    'feature': feature,
                    'importance': importance
                }
            )
        ),
    )
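A quick illustration of the expected output, assuming mapl(fn, items) == list(map(fn, items)) and that decorate_unpack spreads each (key, value) pair into the lambda's two parameters:

import pandas as pd

series = pd.Series({'age': 0.42, 'bmi': 0.17})
# structure_feature_importance(series) would then return:
# [{'feature': 'age', 'importance': 0.42},
#  {'feature': 'bmi', 'importance': 0.17}]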
Example #12
    def get_merged_roc_point(_roc_curves: List[Tuple[np.ndarray, np.ndarray,
                                                     np.ndarray]],
                             threshold: float) -> Tuple[float, float]:
        if threshold > 1:
            threshold = 1

        merged_fpr, merged_tpr = pipe(
            _roc_curves,
            map(lambda curve: get_roc_point_by_threshold(threshold, *curve)),
            list,
            partial(np.mean, axis=0),
        )

        return merged_fpr, merged_tpr
Example #13
def count_values_and_align(series1: Series, series2: Series) -> Tuple[Series, Series]:
    values1 = series1.copy().value_counts().sort_index()
    values2 = series2.copy().value_counts().sort_index()
    indexes = pipe(
        set(values1.index).union(set(values2.index)),
        list,
        sorted,
    )
    for values in (values1, values2):
        for index in indexes:
            try:
                values.loc[index]
            except KeyError:
                values.loc[index] = 0

    return values1.sort_index(), values2.sort_index()
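With count_values_and_align in scope (along with its functional.pipe import), a small check makes the alignment behavior concrete:

import pandas as pd

s1 = pd.Series([1, 1, 2])
s2 = pd.Series([2, 3])
counts1, counts2 = count_values_and_align(s1, s2)
# Both results now share the sorted index [1, 2, 3]:
# counts1 -> 1: 2, 2: 1, 3: 0
# counts2 -> 1: 0, 2: 1, 3: 1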
Example #14
def get_line(
    method: str, metrics: Union[ClassificationMetrics, ClassificationMetricsWithStatistics]
):
    return [
        format_method_name(method),
        *pipe(
            [
                [
                    metrics[metric].mean,
                    (
                        metrics[metric].mean -
                        get_max_metric_value(metric, metrics_for_methods.values())
                    ) if include_delta else None,
                ] + ([format_ci(metrics[metric].ci)] if metric in include_ci_for else [])
                for metric in include
            ],
            flatten,
            compact,
        ),
    ]
Example #15
def stock_history(init_investment, monthly_contribution, stock_prices):
    stock_prices = F.map(stock_prices,
                         F.pipe(lambda x: x.split(" "),
                                lambda x: [int(a) for a in x]))

    sell_prices = stock_prices[-1]

    total_earning = 0
    total_investment = 0
    max_earning_rate = [1 for _ in sell_prices]

    for month in range(len(stock_prices) - 2, -1, -1):
        current_prices = stock_prices[month]
        earning_rate = [sell_prices[i] / price for i, price in enumerate(current_prices)]
        max_earning_rate = [max(a, b) for a, b in zip(max_earning_rate, earning_rate)]

        money = init_investment if month == 0 else monthly_contribution
        total_earning += money * max(max_earning_rate)
        total_investment += money

    return int(round(total_earning - total_investment))
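A worked call, assuming F.map(collection, fn) maps fn over the collection and F.pipe composes left to right (so each row 'a b' is split, then parsed to ints):

# prices: month 0 -> [1, 2], month 1 -> [2, 3], month 2 (sell) -> [4, 6]
# month 1: best remaining rate is 2.0 -> 50 * 2.0 earned on the contribution
# month 0: best remaining rate is 4.0 -> 100 * 4.0 earned on the initial sum
# earnings 500 - investment 150 -> 350
assert stock_history(100, 50, ["1 2", "2 3", "4 6"]) == 350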
Example #16
def get_cluster_mapping_by_prevalence(
    y_pred: Series,
    y_true: Series,
) -> Dict:

    def get_1_prevalence(distribution: Series) -> float:
        try:
            prevalence_1 = (distribution[1] / distribution.sum())
        except KeyError:
            prevalence_1 = 0
        return prevalence_1

    return pipe(
        get_counts_per_cluster(y_pred, y_true),
        dict.items,
        partial(map_tuples, lambda index, distribution: (index, get_1_prevalence(distribution))),
        partial(sorted, key=get(1)),
        enumerate,
        partial(map_tuples, lambda index, item: (item[0], index)),
        list,
        from_items,
    )
Example #17
def spectrum_sampling_pipe(mzs):
    return pipe(try_loadtxt, np.transpose,
                as_arguments_of(partial(np.interp, mzs)), np.ravel,
                partial(np.ndarray.astype, dtype=np.float32))
Example #18
import luigi
import numpy as np

from components.io_utils import text_files, try_loadtxt
from components.spectrum.resampling import estimate_new_axis
from pipeline._base import *


def get_mzs_from_content(content: np.ndarray) -> np.ndarray:
    return content[:, 0]


get_mz_axis = pipe(
    text_files,
    partial(take, n_elements=1),
    as_arguments_of(try_loadtxt),
    get_mzs_from_content
)


mz_range = pipe(
    report_value('dataset'),
    get_mz_axis,
    broadcast(np.min, np.max),
    np.array,
    report_value('mz range')
)


def common_part(first, second):
    return max(first[0], second[0]), min(first[1], second[1])
Example #19
def test_pipe(self):
    func_pipe = F.pipe(
        F.curryr(F.map)(lambda x: x + 1),
        F.curryr(F.map)(lambda x: x * 2))
    self.assertEqual(func_pipe(_list), [4, 6, 8, 10])
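The assertion implies curryr(map)(fn)(coll) == map(coll, fn), i.e. the argument supplied last ends up first. Under that assumption (curryr_sketch and map_list are hypothetical stand-ins, not this library's API), a minimal right-curry could look like:

def curryr_sketch(fn):
    # arguments given now are appended after the one supplied later
    return lambda *rest: lambda first: fn(first, *rest)

def map_list(coll, fn):  # stand-in for a collection-first F.map
    return [fn(x) for x in coll]

step = curryr_sketch(map_list)(lambda x: x + 1)
assert step([1, 2, 3, 4]) == [2, 3, 4, 5]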
Example #20
import os
from functools import partial

from functional import pipe
import numpy as np
import pandas as pd


def rooted_content(root: str):
    return [os.path.join(root, element) for element in os.listdir(root)]


subdirectories = pipe(rooted_content, partial(filter, os.path.isdir))
files = pipe(rooted_content, partial(filter, os.path.isfile))


def has_extension(path, extension: str = '.txt'):
    return os.path.splitext(path)[1].lower() == extension


is_text = partial(has_extension, extension='.txt')
text_files = pipe(files, partial(filter, is_text))


def try_loadtxt(fname: str):
    try:
        return np.loadtxt(fname)
    except ValueError:
        return pd.read_csv(fname, delimiter=' ', header=None,
                           decimal=',').values
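The fallback in try_loadtxt exists because np.loadtxt rejects comma decimals such as '100,5'. A small demonstration, writing a hypothetical spectrum.txt:

with open('spectrum.txt', 'w') as handle:
    handle.write('100,5 1\n200,25 2\n')

print(try_loadtxt('spectrum.txt'))
# np.loadtxt raises ValueError on '100,5', so the pandas branch parses it:
# [[100.5    1.  ]
#  [200.25   2.  ]]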
Example #21
def format_style(style: Dict) -> str:
    return pipe(
        (f'{key}: {value}' for key, value in style.items()),
        lambda _styles: ";".join(_styles),
    )
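Here the pipe merely threads the generator into the join; an equivalent without the library, with the expected output shown:

def format_style_plain(style: dict) -> str:
    return ";".join(f'{key}: {value}' for key, value in style.items())

assert format_style_plain({'color': 'red', 'width': '10px'}) == 'color: red;width: 10px'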
Example #22
def get_datasets(
    base: DataFrame,
    impute: bool = True,
) -> Dict[str, DataFrame]:
    label = 'ACV2'
    data = pipe(
        base,
        format_columns,
        partial(
            select_features,
            features=(
                'RWT', 'EM', 'LVMI', 'IVSD', 'PP', 'GS', 'LA_Adi', 'SBP', 'AM', 'LVPWD', 'MVE_VEL',
                'LVIDD', 'LA_GS', 'RMVEA', 'PR', 'SM', 'LA_Asi', 'REEM', 'REAM', 'IVRT', 'LAEDVi',
                'LAESVi', 'MVA_VEL', 'AO_DIAM', 'EF_MOD', 'LA_A_4ch', 'MV_DECT', 'ESV_MODI',
                'LA_EF_4ch', 'SV_MODI'
            )
        ),
        impute_missing if impute else identity,
    )
    data = DataFrame(
        StandardScaler().fit_transform(data),
        columns=data.columns,
        index=data.index,
    )
    return dict(
        base=base,
        clustering=data,
        clustering_correlated_removed=select_features(
            data, (
                'RWT', 'EM', 'LVMI', 'GS', 'SBP', 'AM', 'LVPWD', 'MVE_VEL', 'LVIDD', 'LA_GS',
                'RMVEA', 'PR', 'SM', 'REEM', 'IVRT', 'LAESVi', 'MVA_VEL', 'AO_DIAM', 'EF_MOD',
                'MV_DECT', 'ESV_MODI', 'LA_EF_4ch', 'SV_MODI'
            )
        ),
        varsellcm=select_features(
            data, (
                'RWT', 'EM', 'LVMI', 'GS', 'SBP', 'AM', 'LVPWD', 'MVE_VEL', 'LVIDD', 'LA_GS',
                'RMVEA', 'SM', 'REEM', 'IVRT', 'LAESVi', 'MVA_VEL', 'AO_DIAM', 'EF_MOD', 'MV_DECT',
                'ESV_MODI', 'LA_EF_4ch', 'SV_MODI'
            )
        ),
        manual=select_features(
            data, (
                'SBP', 'PP', 'LVMI', 'PR', 'REEM', 'ESV_MODI', 'LAESVI', 'LA_GS', 'MVE_VEL',
                'MVA_VEL', 'RMVEA', 'AM', 'EM', 'GS', 'MV_DECT'
            )
        ),
        feature_selection_subsets={
            'manual': [
                'SBP', 'PP', 'LVMI', 'PR', 'REEM', 'ESV_MODI', 'LAESVI', 'LA_GS', 'MVE_VEL',
                'MVA_VEL', 'RMVEA', 'AM', 'EM', 'GS', 'MV_DECT'
            ],
            'normalized_cut_15': [
                'PR', 'IVRT', 'LVIDD', 'PP', 'AO_DIAM', 'IVSD', 'LVMI', 'ESV_MODI', 'SV_MODI',
                'RWT', 'AM', 'REAM', 'SBP', 'LVPWD', 'SM'
            ],
        },
        varsellcm_importance=[
            ['Variables', 'Discrim. Power', 'Discrim. Power (%)', 'Discrim. Power (% cum)'],
            ['REAM', 817.64, 9.31, 9.31],
            ['LAEDVI', 699.19, 7.96, 17.27],
            ['EM', 629.51, 7.17, 24.44],
            ['LA_ADI', 596.19, 6.79, 31.23],
            ['RMVEA', 582.88, 6.64, 37.87],
            ['REEM', 481.04, 5.48, 43.34],
            ['LAESVI', 414.47, 4.72, 48.06],
            ['LVMI', 345.93, 3.94, 52.00],
            ['LA_ASI', 342.89, 3.90, 55.91],
            ['MVA_VEL', 335.27, 3.82, 59.72],
            ['IVSD', 309.54, 3.52, 63.25],
            ['LA_GS', 309.44, 3.52, 66.77],
            ['SBP', 295.45, 3.36, 70.14],
            ['LA_A_4CH', 284.82, 3.24, 73.38],
            ['LA_EF_4CH', 273.40, 3.11, 76.49],
            ['PP', 262.97, 2.99, 79.49],
            ['MV_DECT', 255.25, 2.91, 82.39],
            ['AM', 253.88, 2.89, 85.28],
            ['LVPWD', 244.71, 2.79, 88.07],
            ['RWT', 233.04, 2.65, 90.72],
            ['AO_DIAM', 162.76, 1.85, 92.58],
            ['MVE_VEL', 156.93, 1.79, 94.36],
            ['IVRT', 142.54, 1.62, 95.99],
            ['SM', 136.16, 1.55, 97.54],
            ['ESV_MODI', 114.66, 1.31, 98.84],
            ['LVIDD', 39.61, 0.45, 99.29],
            ['SV_MODI', 28.07, 0.32, 99.61],
            ['PR', 15.19, 0.17, 99.79],
            ['GS', 13.58, 0.15, 99.94],
            ['EF_MOD', 5.15, 0.06, 100.00],
        ],
        varsellcm_importance_k_2=[
            ['Variables', 'Discrim. Power', 'Discrim. Power (%)', 'Discrim. Power (% cum)'],
            ['REAM', 643.22, 9.79, 9.79],
            ['EM', 557.63, 8.49, 18.28],
            ['RMVEA', 428.47, 6.52, 24.81],
            ['LAEDVI', 396.74, 6.04, 30.85],
            ['REEM', 385.05, 5.86, 36.71],
            ['LA_ADI', 337.76, 5.14, 41.85],
            ['LA_GS', 293.81, 4.47, 46.33],
            ['IVSD', 282.91, 4.31, 50.63],
            ['MVA_VEL', 265.06, 4.04, 54.67],
            ['SBP', 251.84, 3.83, 58.50],
            ['MV_DECT', 251.50, 3.83, 62.33],
            ['LVMI', 249.36, 3.80, 66.13],
            ['LVPWD', 238.89, 3.64, 69.77],
            ['RWT', 232.57, 3.54, 73.31],
            ['PP', 231.99, 3.53, 76.84],
            ['LAESVI', 209.58, 3.19, 80.03],
            ['LA_A_4CH', 201.95, 3.07, 83.11],
            ['LA_EF_4CH', 185.32, 2.82, 85.93],
            ['AO_DIAM', 183.30, 2.79, 88.72],
            ['LA_ASI', 161.47, 2.46, 91.18],
            ['AM', 149.00, 2.27, 93.45],
            ['IVRT', 129.09, 1.97, 95.41],
            ['MVE_VEL', 128.61, 1.96, 97.37],
            ['SM', 116.14, 1.77, 99.14],
            ['ESV_MODI', 21.25, 0.32, 99.46],
            ['GS', 15.08, 0.23, 99.69],
            ['EF_MOD', 12.04, 0.18, 99.87],
            ['LVIDD', 8.23, 0.13, 100.00],
        ],
        label=label,
        y_true=base[label],
    )
Example #23
import sys
from functools import partial

from functional import (
    as_arguments_of,
    for_each,
    pipe,
    progress_bar,
)
import numpy as np

from components.io_utils import text_files, try_loadtxt


def spectrum_sampling_pipe(mzs):
    return pipe(try_loadtxt, np.transpose,
                as_arguments_of(partial(np.interp, mzs)), np.ravel,
                partial(np.ndarray.astype, dtype=np.float32))


if __name__ == '__main__':
    DATASET = sys.argv[1]
    MZ_AXIS = sys.argv[2]
    DESTINATION = sys.argv[3]
    new_axis = np.loadtxt(MZ_AXIS, delimiter=',')
    resampled = pipe(
        text_files, list, progress_bar('resampling dataset'),
        for_each(spectrum_sampling_pipe(new_axis),
                 parallel=True,
                 chunksize=800))(DATASET)
    with open(DESTINATION, 'wb') as outfile:
        np.save(outfile, resampled)
Example #24
import os
from functools import partial

from functional import pipe

only_directories = partial(filter, os.path.isdir)


def rooted_listdir(root):
    return [os.path.join(root, path) for path in os.listdir(root)]


subdirectories = pipe(rooted_listdir, only_directories, list)
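In this zero-data form, pipe(f, g, h) composes the callables left to right and returns a new function. A quick smoke test under that reading:

import tempfile

root = tempfile.mkdtemp()
os.mkdir(os.path.join(root, 'nested'))
open(os.path.join(root, 'file.txt'), 'w').close()

print(subdirectories(root))  # ['/tmp/.../nested'] -- the file is filtered out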
Example #25

def get_component(filename, component):
    match = metadata_pattern.search(filename)
    if match is None:
        raise ValueError(filename)
    metadata = filename[match.start() + 1:match.end() - 1]
    metadata = metadata[metadata.find(component) + 1:]
    metadata = metadata[:component_length(metadata)]
    return int(metadata)


spectrum_metadata = pipe(
    broadcast(
        partial(get_component, component='R'),
        partial(get_component, component='X'),
        partial(get_component, component='Y')
    ),
    np.array
)


class AssembleMetadata(BaseTask):
    INPUT_DIR = os.path.join(BaseTask.INPUT_DIR, 'raw')
    OUTPUT_DIR = os.path.join(BaseTask.OUTPUT_DIR, 'metadata')

    dataset = luigi.Parameter(description='Dataset to get metadata from')

    def output(self):
        return self._as_target("{0}.csv".format(self.dataset))

    def run(self):