def _transform(self, dfs):
        icustay_index = dfs['icustay_detail'].set_index('icustay_id')

        # keep only events charted after the ICU admission time
        for df_name in ['chartevents', 'labevents', 'ioevents']:
            df = dfs[df_name]
            df['icustay_intime'] = icustay_index.loc[df['icustay_id'],
                                                     'icustay_intime'].values
            df = df[df['charttime'] > df['icustay_intime']]
            dfs[df_name] = df

        demo_history = DemographicTransform().transform(dfs['icustay_detail'])
        chart_history = ChartTransform().transform(dfs['chartevents'])
        lab_history = LabTransform().transform(dfs['labevents'])
        urine_history = UrineTransform().transform(dfs['ioevents'])

        all_history = from_chunks(
            [chart_history, lab_history, demo_history, urine_history])

        # encode charttime numerically for sorting, then convert back
        all_history['charttime'] = big_dt_to_num(all_history['charttime'])
        all_history = all_history.sort_values(
            by=['subject_id', 'charttime']).reset_index(drop=True)
        all_history['charttime'] = num_to_big_dt(all_history['charttime'])

        all_history.drop_duplicates(inplace=True)
        all_history = all_history[[
            'subject_id', 'charttime', 'category', 'valuenum'
        ]]

        return all_history
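Every example on this page funnels its intermediate DataFrames through from_chunks. Its implementation is not shown here, but judging from the call sites (it accepts a list or generator of DataFrames and returns a single frame) a minimal sketch could look like the following; the pandas.concat call and the ignore_index behavior are assumptions, not the library's documented behavior.

import pandas as pd

def from_chunks(chunks):
    # Sketch only: concatenate an iterable of DataFrame chunks into one frame.
    # The real chatto_transform.lib.chunks.from_chunks may differ in details.
    return pd.concat(list(chunks), ignore_index=True)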
    def _transform(self, df):
        df['days'] = df['end_date'] - df['start_date'] + pd.Timedelta(days=1)

        # remove rows with invalid (negative) date ranges
        invalid_range_mask = df['days'] < pd.Timedelta(days=1)
        df = df[~invalid_range_mask]

        max_days = ceil(df['days'].max().days)
        df['date'] = df['start_date']
        del df['start_date'], df['end_date']

        expanded = [df]
        expand_per_row = False
        for i in range(2, max_days):
            prev = expanded[-1]
            # take an explicit copy so shifting 'date' below doesn't touch prev
            rows_as_long_as = prev[prev['days'] >= pd.Timedelta(days=i)].copy()
            rows_as_long_as.loc[:, 'date'] += pd.Timedelta(days=1)

            # if there are only a relative few rows remaining with long date ranges,
            # it will be more efficient to expand them per-row rather than per-date-range
            if len(rows_as_long_as) < max_days / len(
                    rows_as_long_as.columns)**2:
                expand_per_row = True
                break
            expanded.append(rows_as_long_as)

        #expand remaining rows per-row rather than per-date-range
        if expand_per_row:
            rala = rows_as_long_as
            # represent any categorical columns as ints for fast expansion
            rala_cat = rala.select_dtypes(include=['category'])
            rala_categories = {}
            for col in rala_cat.columns:
                rala_categories[col] = rala[col].cat.categories
                rala[col] = rala[col].cat.codes

            # for each row, generate expanded rows as remaining days in the date range
            for j, row in rala.iterrows():
                date = row['date']
                n_days = row['days'].days - i
                expanded_row = pd.DataFrame(index=np.arange(n_days + 1),
                                            columns=row.index)
                for col in expanded_row.columns:
                    expanded_row[col] = row[col]
                expanded_row['date'] = pd.date_range(
                    date, date + pd.Timedelta(days=n_days))

                #convert numeric back to categorical
                for col, categories in rala_categories.items():
                    expanded_row[col] = pd.Categorical.from_codes(
                        codes=expanded_row[col], categories=categories)
                expanded.append(expanded_row)

        # merge all extra rows
        df = from_chunks(expanded)
        return df
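To make the expansion above concrete, here is a tiny standalone sketch (toy data and column names, naive per-row loop only) that turns each inclusive [start_date, end_date] range into one row per day. The transform above produces the same kind of output, but switches between per-date-range and per-row expansion depending on how many long-range rows remain, which is much faster on large frames.

import pandas as pd

df = pd.DataFrame({
    'item': ['a', 'b'],
    'start_date': pd.to_datetime(['2015-01-01', '2015-01-05']),
    'end_date': pd.to_datetime(['2015-01-03', '2015-01-05']),
})

rows = []
for _, row in df.iterrows():
    # one output row per calendar day in the inclusive range
    for date in pd.date_range(row['start_date'], row['end_date']):
        rows.append({'item': row['item'], 'date': date})

print(pd.DataFrame(rows))
# 'a' becomes three daily rows (Jan 1-3); 'b' stays a single row (Jan 5).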
    def _transform(self, data):
        start = time.time()
        print('transforming data of size',
              data.memory_usage(index=True).sum(), 'bytes')

        store_chunks_jobs = []
        transform_jobs = []
        hdf_stores = []
        try:
            print('splitting data into large groups')
            group_iter = self._group_iter(
                data,
                len(data) // self.n_jobs or self.chunksize)

            for group_data in group_iter:
                if group_data.empty:
                    continue
                f = temp_file.make_temporary_file()
                hdf_store = HdfDataStore(self.input_schema(), f)
                hdf_stores.append(hdf_store)
                hdf_store.store(group_data)

                store_chunks_jobs.append(
                    joblib.delayed(self.store_chunks_job)(hdf_store))
                #transform_jobs.append(joblib.delayed(self.transform_job)(hdf_store))
            print('breaking data into chunks in parallel')
            joblib.Parallel(n_jobs=self.n_jobs)(store_chunks_jobs)

            chunk_stores = chain.from_iterable(store.chunk_stores()
                                               for store in hdf_stores)

            transform_jobs = [
                joblib.delayed(self.transform_job)(chunk_store)
                for chunk_store in chunk_stores
            ]

            print('running transforms in', len(transform_jobs),
                  'parallel jobs')
            result_hdf_stores = joblib.Parallel(
                n_jobs=self.n_jobs)(transform_jobs)

            print('loading and merging the results')
            results = from_chunks(r_hdf_store.load()
                                  for r_hdf_store in result_hdf_stores)
            print('finished merge')
        finally:
            for hdf_store in hdf_stores:
                hdf_store.delete_chunks()
                hdf_store.delete()

        end = time.time()
        print('took', end - start, 'seconds to transform all data in parallel')
        return results
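The split/store/transform/merge choreography above is tied to HdfDataStore and temporary files, but the parallelism itself is the standard joblib pattern. A stripped-down sketch with in-memory chunks (transform_chunk and the toy frame are placeholders, not part of the library):

import joblib
import numpy as np
import pandas as pd

def transform_chunk(chunk):
    # stand-in for the real per-chunk transform
    chunk = chunk.copy()
    chunk['value'] *= 2
    return chunk

data = pd.DataFrame({'value': np.arange(10)})
chunks = np.array_split(data, 4)  # split into roughly equal groups

results = joblib.Parallel(n_jobs=4)(
    joblib.delayed(transform_chunk)(chunk) for chunk in chunks)

merged = pd.concat(results, ignore_index=True)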
    def _transform(self, tables):
        # filtering carevue ioevents
        io_carevue = VasoIOEventsCarevue().transform(tables)
        
        # filtering metavision ioevents
        io_metavision = VasoIOEventsMetavision().transform(tables)
        
        # concatenating ioevents
        ioevents = from_chunks([io_carevue, io_metavision])

        # getting vaso day counts
        vaso_days = VasoDayCounts().transform(ioevents)

        return vaso_days
    def load(self):
        seq = odo.odo(self.odo_target,
                      odo.chunks(pandas.DataFrame),
                      chunksize=CHUNK_SIZE,
                      dshape=schema_to_dshape(self.schema))
        
        def conv_chunks(chunks):
            for chunk in chunks:
                print('typechecking a chunk')
                self.schema.conform_df(chunk, skip_sort=True)
                yield chunk

        print('concatenating df chunks')
        df = from_chunks(conv_chunks(seq))
        
        return df
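Because conv_chunks is a generator, each chunk is typechecked lazily as from_chunks consumes it, so no chunk is materialized twice. The same check-then-yield pattern works with any chunked reader; a hedged sketch using pandas.read_csv instead of odo (the file name and the emptiness check are placeholders):

import pandas as pd

def checked_chunks(path, chunksize=100000):
    for chunk in pd.read_csv(path, chunksize=chunksize):
        # stand-in for self.schema.conform_df(chunk, skip_sort=True)
        assert not chunk.empty
        yield chunk

# df = pd.concat(checked_chunks('events.csv'), ignore_index=True)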
Example #12
from chatto_transform.config import config
from chatto_transform.schema.ss.ss_sql_raw_schema import appointments
from chatto_transform.datastores.hdf_datastore import HdfDataStore
from chatto_transform.lib.chunks import from_chunks

import time

ds = HdfDataStore(appointments, config.data_dir + 'test.hdf')

chunks = ds.load_chunks()

start = time.time()
df = from_chunks(chunks)
end = time.time()

print('took', end - start, 'seconds to load and concatenate all data')
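For comparison, the result stores in the parallel transform above are read back with HdfDataStore.load(), so presumably the chunked load here could also be done in one call:

# df = ds.load()  # assumed one-shot equivalent of load_chunks() + from_chunks()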