Пример #1
0
def setup_local_results(exp_time):
    """Create a per-run log directory, configure logging, and snapshot scripts.

    Args:
        exp_time: Timestamp string used as the run's directory and log name.

    Returns:
        Path of the newly created log directory.
    """
    log_root = os.path.join(project_root(), 'data', 'logs', exp_time)
    os.mkdir(log_root)

    # All DEBUG-and-above records go to <log_root>/<exp_time>.log.
    log_file = os.path.join(log_root, exp_time + '.log')
    logging.basicConfig(filename=log_file, level=logging.DEBUG)

    # Snapshot the training code next to the logs for reproducibility.
    for script in ('pytorch_classifier.py', 'train.py'):
        shutil.copy(os.path.join(project_root(), script), log_root)

    return log_root
Пример #2
0
def initialize_local_experiment():
    """Prepare a local training run: log dir, data, labels, and writer.

    Returns:
        Tuple (training_examples, lengths_list, is_sepsis, writer,
        results_path).
    """
    exp_time = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
    results_path = setup_local_results(exp_time)

    data_name = 'training_filled.pickle'
    log(message="Data name: {}", value=data_name)

    processed_dir = os.path.join(project_root(), 'data', 'processed')

    # Pre-processed per-patient DataFrames.
    training_examples = pd.read_pickle(os.path.join(processed_dir, data_name))

    def _read_int_lines(filename):
        # One integer per line.
        with open(os.path.join(processed_dir, filename)) as f:
            return [int(line) for line in f.read().splitlines()]

    lengths_list = _read_int_lines('lengths.txt')
    is_sepsis = _read_int_lines('is_sepsis.txt')

    # TensorBoard writer logs into the same per-run directory.
    writer = SummaryWriter(log_dir=os.path.join(project_root(), 'data', 'logs',
                                                exp_time),
                           comment='')

    return training_examples, lengths_list, is_sepsis, writer, results_path
Пример #3
0
def calculate_sepsis():
    """Print the path of every processed training file containing sepsis.

    Scans data/processed/training for .csv files and prints each file whose
    'SepsisLabel' column contains a positive (1) label.
    """
    data_path = os.path.join(project_root(), 'data', 'processed', 'training')

    csv_files = sorted(
        os.path.join(data_path, name)
        for name in os.listdir(data_path)
        if name.endswith('.csv'))

    for csv_file in tqdm.tqdm(csv_files):
        # NOTE(review): files are read with sep='|' although they live in the
        # processed CSV directory — confirm the delimiter is intended.
        frame = pd.read_csv(csv_file, sep='|')
        if 1 in frame['SepsisLabel'].values:
            print(csv_file)
def rewrite_data_to_better_formats(training_files):
    """Concatenate per-patient CSVs and persist them in several formats.

    Writes, under data/processed/:
      * training_concatenated.hdf / .csv — all rows stacked into one table,
      * lengths.txt — number of rows per patient file,
      * is_sepsis.txt — 1 if the patient ever has SepsisLabel == 1, else 0,
      * training_raw.pickle — list of raw per-patient DataFrames,
      * training_filled.pickle — same list with ffill/bfill/zero-fill applied.

    Args:
        training_files: Paths of per-patient comma-separated CSV files.
    """
    lengths = []
    is_sepsis = []
    # NOTE(review): 1552210 x 42 is a hard-coded dataset size; it must match
    # sum of row counts or rows are silently zero-padded/overflowed.
    all_data = np.zeros((1552210, 42))
    ind = 0
    training_examples = []
    columns = None  # set from the data; avoids NameError on empty input
    for i, training_file in enumerate(tqdm.tqdm(training_files)):
        example = pd.read_csv(training_file, sep=',')
        example['seg_id'] = i
        training_examples.append(example)
        is_sepsis.append(1 if 1 in example['SepsisLabel'].values else 0)

        lengths.append(len(example))

        all_data[ind:ind + len(example), :] = example.values
        ind += len(example)
        columns = example.columns.values
    all_data = pd.DataFrame(all_data, columns=columns, index=None)

    processed_dir = os.path.join(project_root(), 'data', 'processed')

    all_data.to_hdf(os.path.join(processed_dir, 'training_concatenated.hdf'),
                    key='df')
    all_data.to_csv(os.path.join(processed_dir, 'training_concatenated.csv'),
                    index=False)

    # Standardized copy; kept after the writes so the persisted files hold
    # the raw (unscaled) values, mirroring the original behavior.
    ss = sklearn.preprocessing.StandardScaler()
    all_data = pd.DataFrame(ss.fit_transform(all_data),
                            columns=all_data.columns.values)

    # Use writelines instead of side-effect list comprehensions.
    with open(os.path.join(processed_dir, 'lengths.txt'), 'w') as f:
        f.writelines('{}\n'.format(l) for l in lengths)
    with open(os.path.join(processed_dir, 'is_sepsis.txt'), 'w') as f:
        f.writelines('{}\n'.format(l) for l in is_sepsis)

    with open(os.path.join(processed_dir, 'training_raw.pickle'), 'wb') as f:
        pickle.dump(training_examples, f)

    # Second pass: fill gaps forward, then backward, then with zeros, and
    # pickle the filled examples.
    training_examples = []
    for training_file in tqdm.tqdm(training_files):
        example = pd.read_csv(training_file, sep=',')
        example.ffill(inplace=True)
        example.bfill(inplace=True)
        example.fillna(0, inplace=True)
        training_examples.append(example)

    with open(os.path.join(processed_dir, 'training_filled.pickle'),
              'wb') as f:
        pickle.dump(training_examples, f)
Пример #5
0
def check_if_exists_empty_after_non_empty(training_examples):
    """Plot examples where SepsisLabel drops back to 0 after a 1.

    For every septic patient, scans the label sequence; each time a 0 follows
    an earlier 1, the full label trace is plotted. Prints 'Occured' if any
    such case exists. Side effect: fills NaNs with 0 in place.
    """
    occured = False
    for training_example in tqdm.tqdm(training_examples):

        training_example.fillna(0, inplace=True)
        labels = training_example['SepsisLabel'].values
        if 1 not in labels:
            continue
        seen_sepsis = False
        for label in labels:
            if label == 1:
                seen_sepsis = True
            elif seen_sepsis and label == 0:
                # A 0 after a 1: show the whole label trace for inspection.
                plt.figure()
                plt.plot(training_example['SepsisLabel'], c="g")
                plt.show(block=False)
                occured = True
    if occured:
        print('Occured')


if __name__ == '__main__':

    # Load the raw per-patient DataFrames produced by the preprocessing step.
    with open(os.path.join(project_root(), 'data', 'processed', 'training.pickle'), 'rb') as fp:
        training_examples = pickle.load(fp)

    # plot_sepsis_label(training_examples)
    # plot_length_hist(training_examples)
    # check_if_exists_empty_after_non_empty(training_examples)
    calculate_sepsis()
    plot_start_sepsis_hist(training_examples)
Пример #6
0
import os

from sklearn.manifold import TSNE
import plotly.express as px
import pandas as pd

from utils.path_utils import project_root

if __name__ == '__main__':
    # Extracted LibriSpeech gender features for the test-clean split.
    path = os.path.join(project_root(), 'data', 'processed',
                        'librispeech-gender-feats-test-clean.csv')
    data = pd.read_csv(path)
    # Acoustic feature columns fed to the 2-D embedding.
    columns = [
        'mean', 'std', 'median', 'kurt', 'skew', 'p25', 'p75', 'iqr', 'ent',
        'meanfun', 'maxfun', 'minfun'
    ]

    # t-SNE cannot handle NaNs; drop incomplete rows first.
    data = data.dropna()

    perplexity = 20
    tsne = TSNE(n_components=2, perplexity=perplexity, random_state=42)
    feats2d = tsne.fit_transform(data[columns].values)

    data['x'] = feats2d[:, 0]
    data['y'] = feats2d[:, 1]
    # NOTE(review): the figure is built but never shown or saved here —
    # the snippet may be truncated; confirm fig.show()/write_html was dropped.
    fig = px.scatter(data,
                     x='x',
                     y='y',
                     title=f'TSNE 2d projection of data<br>{path}',
                     color="label",
                     hover_data=['path'])
import pandas as pd
import numpy as np
import os
import pickle
import tqdm
from utils.path_utils import project_root

# Convert raw pipe-separated PhysioNet files (.psv) under data/raw/training
# into comma-separated .csv files under data/processed/training.
data_path = os.path.join(project_root(), 'data', 'raw', 'training')
data_path2 = os.path.join(project_root(), 'data', 'processed', 'training')

training_examples = []
training_files = sorted(f for f in os.listdir(data_path) if f.endswith('.psv'))

for training_file in tqdm.tqdm(training_files):
    frame = pd.read_csv(os.path.join(data_path, training_file), sep='|')
    out_name = training_file[:-4] + '.csv'
    frame.to_csv(os.path.join(data_path2, out_name),
                 sep=',',
                 index=False,
                 header=frame.columns)
import os

from sklearn.manifold import TSNE
import plotly.express as px
import pandas as pd

from utils.path_utils import project_root


if __name__ == '__main__':
    # Extracted LibriSpeech gender features for the test-clean split.
    path = os.path.join(project_root(), 'data', 'processed',
                        'librispeech-gender-feats-test-clean.csv')
    data = pd.read_csv(path)
    # Statistical acoustic features used for the embedding.
    columns = ['mean', 'std', 'median', 'kurt', 'skew', 'p25', 'p75', 'iqr',
               'ent']

    # t-SNE cannot handle NaNs; drop incomplete rows first.
    data = data.dropna()

    perplexity = 20
    tsne = TSNE(n_components=2, perplexity=perplexity, random_state=42)
    feats2d = tsne.fit_transform(data[columns].values)

    data['x'] = feats2d[:, 0]
    data['y'] = feats2d[:, 1]
    # One statement per line (the original chained fig.show() behind a
    # semicolon on the px.scatter line).
    fig = px.scatter(data,
                     x='x',
                     y='y',
                     title=f'TSNE 2d projection of data<br>{path}',
                     color="label",
                     hover_data=['path'])
    fig.show()
    fig.write_html(os.path.join(project_root(), "data", "processed",
                                f"2d_{perplexity}.html"))
Пример #9
0
import os
import pandas as pd
from utils.path_utils import project_root
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score

if __name__ == '__main__':

    # Training split features.
    train_data = pd.read_csv(
        os.path.join(project_root(), 'data', 'processed',
                     'librispeech-gender-feats-train-clean-100.csv'))

    # Held-out dev split.
    dev_data = pd.read_csv(
        os.path.join(project_root(), 'data', 'processed',
                     'librispeech-gender-feats-dev-clean.csv'))

    # Rows with missing features cannot be used for fitting.
    train_data = train_data.dropna()
    dev_data = dev_data.dropna()

    # Feature columns fed to the classifier.
    columns = [
        'mean', 'std', 'median', 'kurt', 'skew', 'p25', 'p75', 'iqr', 'ent'
    ]

    train_y = train_data['label']

    # Encode string gender labels as integers.
    # NOTE(review): the script appears truncated here — the encoder is fitted
    # but no model training/evaluation is visible; confirm against full file.
    le = LabelEncoder()
    le.fit(train_data['label'])
Пример #10
0
    ],
}

aggs.update(aggs_clinical)
aggs.update(aggs_vital)


# rolling
# expandingx
def agg_feats(df):
    """Stub for aggregate feature computation over *df*.

    NOTE(review): not implemented — always returns None, so callers such as
    extract_features() currently propagate None. Confirm this is intentional.
    """

    return None


def extract_features(df):
    """Apply all feature-extraction steps to *df* and return the result."""
    return agg_feats(df)


if __name__ == '__main__':
    data_name = 'training_concatenated.hdf'
    training_examples_path = os.path.join(project_root(), 'data', 'processed',
                                          data_name)
    df = pd.read_hdf(training_examples_path, key='df')
    # Development subsample: only the first 1000 rows.
    df = df.iloc[:1000, :]

    # NOTE(review): agg_feats() currently returns None, so extract_features()
    # returns None and the to_csv call below would raise — confirm.
    df = extract_features(df)
    df.to_csv(
        os.path.join(project_root(), 'data', 'processed',
                     'training_features.csv'))
import os
import pandas as pd
from utils.path_utils import project_root
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN, KMeans

from sklearn.manifold import TSNE
import plotly.express as px


if __name__ == '__main__':

    train_data = pd.read_csv(os.path.join(project_root(), 'data', 'processed',
                                          'librispeech-gender-feats-train-clean-100.csv'))

    dev_data = pd.read_csv(os.path.join(project_root(), 'data', 'processed',
                                        'librispeech-gender-feats-dev-clean.csv'))

    data = dev_data.dropna()

    columns = ['mean', 'std', 'median', 'kurt', 'skew', 'p25', 'p75', 'iqr', 'ent', 'meanfun', 'maxfun', 'minfun']
    # columns = ['iqr', 'meanfun']


    for i in range(7, len(columns)-1):
        for j in range(i+1, len(columns)):

            tmp_columns = [columns[i], columns[j]]
            values = data[tmp_columns].values

            scaler = StandardScaler()
Пример #12
0
                                               35] - training_example.values[
                                                   6:, :35]

        training_example = pd.concat(
            [training_example,
             pd.DataFrame(columns=lag_columns)])
        training_example.loc[6:, lag_columns] = lag_features

        # training_example.ffill(inplace=True)
        # training_example.bfill(inplace=True)
        # training_example.fillna(0, inplace=True)

        training_examples_lag.append(training_example)

    return training_examples_lag


if __name__ == '__main__':

    # Input: NaN-filled per-patient DataFrames.
    data_name = 'training_filled.pickle'

    training_examples = pd.read_pickle(
        os.path.join(project_root(), 'data', 'processed', data_name))

    # Append lagged features to every example.
    training_examples = add_lag_features(training_examples)

    with open(
            os.path.join(project_root(), 'data', 'processed',
                         'training_filled_lag.pickle'), 'wb') as f:
        pickle.dump(training_examples, f)
    std = bin_means.std()
    median = np.median(bin_means)
    kurt = stats.kurtosis(bin_means)
    skew = stats.skew(bin_means)
    p25 = np.percentile(bin_means, 25)
    p75 = np.percentile(bin_means, 75)
    iqr = p75 - p25
    ent = stats.entropy(bin_means)

    return mean, std, median, kurt, skew, p25, p75, iqr, ent


if __name__ == '__main__':
    # LibriSpeech split to process.
    chosen_set = 'train-clean-100'

    raw_data_root = os.path.join(project_root(), 'data', 'raw', 'LibriSpeech')
    # SPEAKERS.TXT maps speaker ids to metadata (incl. gender labels).
    speakers_filepath = os.path.join(raw_data_root, 'SPEAKERS.TXT')

    results_filepath = os.path.join(project_root(), 'data', 'processed', f'librispeech-gender-feats-{chosen_set}.csv')

    audio_paths, labels = get_librispeech_paths(raw_data_root, speakers_filepath, contains=chosen_set)

    # Extract one feature row per audio file.
    tq = tqdm.tqdm(enumerate((zip(audio_paths, labels))), total=len(audio_paths))
    feats_rows = []
    for i, (path, label) in tq:
        audio, fs = sf.read(path)
        row = extract_features(audio, fs)
        feats_rows.append(row)

    results = pd.DataFrame(feats_rows, columns=['mean', 'std', 'median', 'kurt', 'skew', 'p25', 'p75', 'iqr', 'ent'])
    results['path'] = ['/'.join(p.split('/')[-4:]) for p in audio_paths]
    # NOTE(review): 'duration' reflects only the last file read in the loop,
    # and the snippet appears truncated/merged past this line — confirm
    # against the full file (a stray 'return' follows at module level).
    duration = len(audio) / fs

    return mean, std, median, kurt, skew, p25, p75, iqr, ent, meanfun, maxfun, minfun, duration


def extract_speaker_id(rec_path):
    """Return the speaker id: the third-from-last path component."""
    parts = rec_path.split(os.sep)
    return parts[-3]


if __name__ == '__main__':
    # chosen_set = 'train-clean-100'
    # chosen_set = 'test-clean'
    chosen_set = 'dev-clean'

    # Support both on-disk layouts: data/raw/LibriSpeech and
    # data/raw/<split>/LibriSpeech.
    if os.path.isdir(os.path.join(project_root(), 'data', 'raw',
                                  'LibriSpeech')):
        raw_data_root = os.path.join(project_root(), 'data', 'raw',
                                     'LibriSpeech')
    else:
        raw_data_root = os.path.join(project_root(), 'data', 'raw', chosen_set,
                                     'LibriSpeech')
    # SPEAKERS.TXT maps speaker ids to metadata (incl. gender labels).
    speakers_filepath = os.path.join(raw_data_root, 'SPEAKERS.TXT')

    results_filepath = os.path.join(
        project_root(), 'data', 'processed',
        f'librispeech-gender-feats-{chosen_set}.csv')

    audio_paths, labels = get_librispeech_paths(raw_data_root,
                                                speakers_filepath,
                                                contains=chosen_set)