Example #1
def test_hdf_globbing():
    pytest.importorskip('tables')
    df = pd.DataFrame({'x': ['a', 'b', 'c', 'd'],
                       'y': [1, 2, 3, 4]}, index=[1., 2., 3., 4.])

    with tmpdir() as tdir:
        df.to_hdf(os.path.join(tdir, 'one.h5'), '/foo/data', format='table')
        df.to_hdf(os.path.join(tdir, 'two.h5'), '/bar/data', format='table')
        df.to_hdf(os.path.join(tdir, 'two.h5'), '/foo/data', format='table')

        with dask.set_options(get=dask.get):
            res = dd.read_hdf(os.path.join(tdir, 'one.h5'), '/*/data',
                              chunksize=2)
            assert res.npartitions == 2
            tm.assert_frame_equal(res.compute(), df)

            res = dd.read_hdf(os.path.join(tdir, 'one.h5'), '/*/data',
                              chunksize=2, start=1, stop=3)
            expected = pd.read_hdf(os.path.join(tdir, 'one.h5'), '/foo/data',
                                   start=1, stop=3)
            tm.assert_frame_equal(res.compute(), expected)

            res = dd.read_hdf(os.path.join(tdir, 'two.h5'), '/*/data', chunksize=2)
            assert res.npartitions == 2 + 2
            tm.assert_frame_equal(res.compute(), pd.concat([df] * 2))

            res = dd.read_hdf(os.path.join(tdir, '*.h5'), '/foo/data', chunksize=2)
            assert res.npartitions == 2 + 2
            tm.assert_frame_equal(res.compute(), pd.concat([df] * 2))

            res = dd.read_hdf(os.path.join(tdir, '*.h5'), '/*/data', chunksize=2)
            assert res.npartitions == 2 + 2 + 2
            tm.assert_frame_equal(res.compute(), pd.concat([df] * 3))
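Outside the test harness, the globbing rules exercised above reduce to the following sketch (file names are hypothetical and assume the same three files written in the test):

import dask.dataframe as dd

# Glob over nodes within a single file: every matching node contributes partitions.
ddf_nodes = dd.read_hdf('one.h5', '/*/data', chunksize=2)

# Glob over files for a fixed node.
ddf_files = dd.read_hdf('*.h5', '/foo/data', chunksize=2)

# Glob over both files and nodes: the result concatenates every matching file/node pair.
ddf_both = dd.read_hdf('*.h5', '/*/data', chunksize=2)
print(ddf_both.npartitions)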
Example #2
def test_to_hdf_lock_delays():
    pytest.importorskip('tables')
    df16 = pd.DataFrame({'x': ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p'],
                       'y': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]},
                            index=[1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.])
    a = dd.from_pandas(df16, 16)

    # adding artificial delays to make sure the last tasks finish first
    # (earlier rows sleep longer, so later partitions complete before earlier ones)
    def delayed_nop(i):
        if i[1] < 10:
            sleep(0.1*(10-i[1]))
        return i

    # saving to multiple hdf nodes
    with tmpfile() as fn:
        a = a.apply(delayed_nop, axis=1, columns=a.columns)
        a.to_hdf(fn, '/data*')
        out = dd.read_hdf(fn, '/data*')
        eq(df16, out)

    # saving to multiple hdf files
    # adding artificial delays to make sure the last tasks finish first
    with tmpdir() as dn:
        fn = os.path.join(dn, 'data*')
        a = a.apply(delayed_nop, axis=1, columns=a.columns)
        a.to_hdf(fn, '/data')
        out = dd.read_hdf(fn, '/data')
        eq(df16, out)
Example #3
def test_to_hdf_thread():
    pytest.importorskip('tables')
    df = pd.DataFrame({'x': ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p'],
                       'y': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]},
                            index=[1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.])
    a = dd.from_pandas(df, 16)

    # test single file single node
    with tmpfile('h5') as fn:
        a.to_hdf(fn, '/data', get=dask.threaded.get)
        out = pd.read_hdf(fn, '/data')
        eq(df, out)

    # test multiple files single node
    with tmpdir() as dn:
        fn = os.path.join(dn, 'data_*.h5')
        a.to_hdf(fn, '/data', get=dask.threaded.get)
        out = dd.read_hdf(fn, '/data')
        eq(df, out)

    # test single file multiple nodes
    with tmpfile('h5') as fn:
        a.to_hdf(fn, '/data*', get=dask.threaded.get)
        out = dd.read_hdf(fn, '/data*')
        eq(df, out)
Example #4
def test_hdf_globbing():
    pytest.importorskip("tables")
    df = pd.DataFrame({"x": ["a", "b", "c", "d"], "y": [1, 2, 3, 4]}, index=[1.0, 2.0, 3.0, 4.0])

    tdir = tempfile.mkdtemp()
    try:
        df.to_hdf(os.path.join(tdir, "one.h5"), "/foo/data", format="table")
        df.to_hdf(os.path.join(tdir, "two.h5"), "/bar/data", format="table")
        df.to_hdf(os.path.join(tdir, "two.h5"), "/foo/data", format="table")

        res = dd.read_hdf(os.path.join(tdir, "one.h5"), "/*/data", chunksize=2)
        assert res.npartitions == 2
        tm.assert_frame_equal(res.compute(), df)

        res = dd.read_hdf(os.path.join(tdir, "one.h5"), "/*/data", chunksize=2, start=1, stop=3)
        expected = pd.read_hdf(os.path.join(tdir, "one.h5"), "/foo/data", start=1, stop=3)
        tm.assert_frame_equal(res.compute(), expected)

        res = dd.read_hdf(os.path.join(tdir, "two.h5"), "/*/data", chunksize=2)
        assert res.npartitions == 2 + 2
        tm.assert_frame_equal(res.compute(), pd.concat([df] * 2))

        res = dd.read_hdf(os.path.join(tdir, "*.h5"), "/foo/data", chunksize=2)
        assert res.npartitions == 2 + 2
        tm.assert_frame_equal(res.compute(), pd.concat([df] * 2))

        res = dd.read_hdf(os.path.join(tdir, "*.h5"), "/*/data", chunksize=2)
        assert res.npartitions == 2 + 2 + 2
        tm.assert_frame_equal(res.compute(), pd.concat([df] * 3))
    finally:
        shutil.rmtree(tdir)
Example #5
def test_hdf_globbing():
    pytest.importorskip('tables')
    df = pd.DataFrame({'x': ['a', 'b', 'c', 'd'],
                       'y': [1, 2, 3, 4]}, index=[1., 2., 3., 4.])

    tdir = tempfile.mkdtemp()
    try:
        df.to_hdf(os.path.join(tdir, 'one.h5'), '/foo/data', format='table')
        df.to_hdf(os.path.join(tdir, 'two.h5'), '/bar/data', format='table')
        df.to_hdf(os.path.join(tdir, 'two.h5'), '/foo/data', format='table')

        res = dd.read_hdf(os.path.join(tdir, 'one.h5'), '/*/data',
                          chunksize=2)
        assert res.npartitions == 2
        tm.assert_frame_equal(res.compute(), df)

        res = dd.read_hdf(os.path.join(tdir, 'one.h5'), '/*/data',
                          chunksize=2, start=1, stop=3)
        expected = pd.read_hdf(os.path.join(tdir, 'one.h5'), '/foo/data',
                               start=1, stop=3)
        tm.assert_frame_equal(res.compute(), expected)

        res = dd.read_hdf(os.path.join(tdir, 'two.h5'), '/*/data', chunksize=2)
        assert res.npartitions == 2 + 2
        tm.assert_frame_equal(res.compute(), pd.concat([df] * 2))

        res = dd.read_hdf(os.path.join(tdir, '*.h5'), '/foo/data', chunksize=2)
        assert res.npartitions == 2 + 2
        tm.assert_frame_equal(res.compute(), pd.concat([df] * 2))

        res = dd.read_hdf(os.path.join(tdir, '*.h5'), '/*/data', chunksize=2)
        assert res.npartitions == 2 + 2 + 2
        tm.assert_frame_equal(res.compute(), pd.concat([df] *  3))
    finally:
        shutil.rmtree(tdir)
Example #6
def test_read_hdf_multiply_open():
    """Test that we can read from a file that's already opened elsewhere in
    read-only mode."""
    pytest.importorskip('tables')
    df = pd.DataFrame({'x': ['a', 'b', 'c', 'd'],
                       'y': [1, 2, 3, 4]}, index=[1., 2., 3., 4.])
    with tmpfile('h5') as fn:
        df.to_hdf(fn, '/data', format='table')
        with pd.HDFStore(fn, mode='r'):
            dd.read_hdf(fn, '/data', chunksize=2, mode='r')
Example #7
def test_read_hdf(data, compare):
    pytest.importorskip('tables')
    with tmpfile('h5') as fn:
        data.to_hdf(fn, '/data')
        try:
            dd.read_hdf(fn, 'data', chunksize=2, mode='r')
            assert False
        except TypeError as e:
            assert "format='table'" in str(e)

    with tmpfile('h5') as fn:
        data.to_hdf(fn, '/data', format='table')
        a = dd.read_hdf(fn, '/data', chunksize=2, mode='r')
        assert a.npartitions == 2

        compare(a.compute(), data)

        compare(dd.read_hdf(fn, '/data', chunksize=2, start=1, stop=3,
                            mode='r').compute(),
                pd.read_hdf(fn, '/data', start=1, stop=3))

        assert (sorted(dd.read_hdf(fn, '/data', mode='r').dask) ==
                sorted(dd.read_hdf(fn, '/data', mode='r').dask))

    with tmpfile('h5') as fn:
        sorted_data = data.sort_index()
        sorted_data.to_hdf(fn, '/data', format='table')
        a = dd.read_hdf(fn, '/data', chunksize=2, sorted_index=True, mode='r')
        assert a.npartitions == 2

        compare(a.compute(), sorted_data)
Example #8
def test_to_hdf_multiple_files():
    pytest.importorskip('tables')
    df = pd.DataFrame({'x': ['a', 'b', 'c', 'd'],
                       'y': [1, 2, 3, 4]}, index=[1., 2., 3., 4.])
    a = dd.from_pandas(df, 2)
    df16 = pd.DataFrame({'x': ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i',
                               'j', 'k', 'l', 'm', 'n', 'o', 'p'],
                         'y': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
                               15, 16]},
                        index=[1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11.,
                               12., 13., 14., 15., 16.])
    b = dd.from_pandas(df16, 16)

    # saving to multiple files
    with tmpdir() as dn:
        fn = os.path.join(dn, 'data_*.h5')
        a.to_hdf(fn, '/data')
        out = dd.read_hdf(fn, '/data')
        assert_eq(df, out)

    # saving to multiple files making sure order is kept
    with tmpdir() as dn:
        fn = os.path.join(dn, 'data_*.h5')
        b.to_hdf(fn, '/data')
        out = dd.read_hdf(fn, '/data')
        assert_eq(df16, out)

    # saving to multiple files with custom name_function
    with tmpdir() as dn:
        fn = os.path.join(dn, 'data_*.h5')
        a.to_hdf(fn, '/data', name_function=lambda i: 'a' * (i + 1))
        out = dd.read_hdf(fn, '/data')
        assert_eq(df, out)

        out = pd.read_hdf(os.path.join(dn, 'data_a.h5'), '/data')
        tm.assert_frame_equal(out, df.iloc[:2])
        out = pd.read_hdf(os.path.join(dn, 'data_aa.h5'), '/data')
        tm.assert_frame_equal(out, df.iloc[2:])

    # test hdf object
    with tmpfile('h5') as fn:
        with pd.HDFStore(fn) as hdf:
            a.to_hdf(hdf, '/data*')
            out = dd.read_hdf(fn, '/data*')
            assert_eq(df, out)
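The name_function mechanics shown above, in isolation (output paths are hypothetical; a is the two-partition dask dataframe from the test):

# '*' in the target path is replaced by name_function(partition_index).
a.to_hdf('out_*.h5', '/data', name_function=lambda i: 'a' * (i + 1))
# partition 0 is written to out_a.h5, partition 1 to out_aa.h5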
Example #9
def test_to_hdf_modes_multiple_nodes():
    pytest.importorskip('tables')
    df = pd.DataFrame({'x': ['a', 'b', 'c', 'd'],
                       'y': [1, 2, 3, 4]}, index=[1., 2., 3., 4.])

    # appending a single partition to existing data
    a = dd.from_pandas(df, 1)
    with tmpfile('h5') as fn:
        a.to_hdf(fn, '/data2')
        a.to_hdf(fn, '/data*', mode='a')
        out = dd.read_hdf(fn, '/data*')
        eq(df.append(df), out)

    # overwriting a file with a single partition
    a = dd.from_pandas(df, 1)
    with tmpfile('h5') as fn:
        a.to_hdf(fn, '/data2')
        a.to_hdf(fn, '/data*', mode='w')
        out = dd.read_hdf(fn, '/data*')
        eq(df, out)

    # appending two partitions to existing data
    a = dd.from_pandas(df, 2)
    with tmpfile('h5') as fn:
        a.to_hdf(fn, '/data2')
        a.to_hdf(fn, '/data*', mode='a')
        out = dd.read_hdf(fn, '/data*')
        eq(df.append(df), out)

    # overwriting a file with two partitions
    a = dd.from_pandas(df, 2)
    with tmpfile('h5') as fn:
        a.to_hdf(fn, '/data2')
        a.to_hdf(fn, '/data*', mode='w')
        out = dd.read_hdf(fn, '/data*')
        eq(df, out)

    # overwriting a single partition, keeping other partitions
    a = dd.from_pandas(df, 2)
    with tmpfile('h5') as fn:
        a.to_hdf(fn, '/data1')
        a.to_hdf(fn, '/data2')
        a.to_hdf(fn, '/data*', mode='a', append=False)
        out = dd.read_hdf(fn, '/data*')
        eq(df.append(df), out)
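Condensed, the mode semantics exercised by this test are (target path hypothetical; a is the dask dataframe defined above):

a.to_hdf('out.h5', '/data*', mode='a')                # append to an existing file and its nodes
a.to_hdf('out.h5', '/data*', mode='w')                # overwrite the whole file
a.to_hdf('out.h5', '/data*', mode='a', append=False)  # keep the file, but overwrite the matching nodes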
Example #10
def test_read_hdf_doesnt_segfault():
    with tmpfile("h5") as fn:
        N = 40
        df = pd.DataFrame(np.random.randn(N, 3))
        with pd.HDFStore(fn, mode="w") as store:
            store.append("/x", df)

        ddf = dd.read_hdf(fn, "/x", chunksize=2)
        assert len(ddf) == N
Example #11
def test_read_hdf_doesnt_segfault():
    pytest.importorskip('tables')
    with tmpfile('h5') as fn:
        N = 40
        df = pd.DataFrame(np.random.randn(N, 3))
        with pd.HDFStore(fn, mode='w') as store:
            store.append('/x', df)

        ddf = dd.read_hdf(fn, '/x', chunksize=2)
        assert len(ddf) == N
Example #12
def test_read_hdf():
    pytest.importorskip('tables')
    df = pd.DataFrame({'x': ['a', 'b', 'c', 'd'],
                       'y': [1, 2, 3, 4]}, index=[1., 2., 3., 4.])
    with tmpfile('h5') as fn:
        df.to_hdf(fn, '/data')
        try:
            dd.read_hdf(fn, 'data', chunksize=2)
            assert False
        except TypeError as e:
            assert "format='table'" in str(e)

    with tmpfile('h5') as fn:
        df.to_hdf(fn, '/data', format='table')
        a = dd.read_hdf(fn, '/data', chunksize=2)
        assert a.npartitions == 2
        assert a._known_dtype

        tm.assert_frame_equal(a.compute(), df)

        tm.assert_frame_equal(
              dd.read_hdf(fn, '/data', chunksize=2, start=1, stop=3).compute(),
              pd.read_hdf(fn, '/data', start=1, stop=3))

        assert sorted(dd.read_hdf(fn, '/data').dask) == \
               sorted(dd.read_hdf(fn, '/data').dask)
Example #13
def test_hdf_file_list():
    pytest.importorskip('tables')
    df = pd.DataFrame({'x': ['a', 'b', 'c', 'd'],
                       'y': [1, 2, 3, 4]}, index=[1., 2., 3., 4.])

    with tmpdir() as tdir:
        df.iloc[:2].to_hdf(os.path.join(tdir, 'one.h5'), 'dataframe', format='table')
        df.iloc[2:].to_hdf(os.path.join(tdir, 'two.h5'), 'dataframe', format='table')

        with dask.set_options(get=dask.get):
            input_files = [os.path.join(tdir, 'one.h5'), os.path.join(tdir, 'two.h5')]
            res = dd.read_hdf(input_files, 'dataframe')
            tm.assert_frame_equal(res.compute(), df)
Example #14
def test_read_hdf_multiple():
    pytest.importorskip('tables')
    df = pd.DataFrame({'x': ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
                             'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p'],
                       'y': [1, 2, 3, 4, 5, 6, 7, 8, 9,
                             10, 11, 12, 13, 14, 15, 16]},
                            index=[1., 2., 3., 4., 5., 6., 7., 8., 9.,
                                   10., 11., 12., 13., 14., 15., 16.])
    a = dd.from_pandas(df, 16)

    with tmpfile('h5') as fn:
        a.to_hdf(fn, '/data*')
        r = dd.read_hdf(fn, '/data*', sorted_index=True)
        assert a.npartitions == r.npartitions
        assert a.divisions == r.divisions
        eq(a, r)
Example #15
    def load_datasets(self,outofcore):
        data_path = self.config['file']
        print('Loading Data from {}...'.format(data_path))

        if not path.isabs(data_path):
            config_dir = path.split(self.config_path)[0]
            data_path = path.join(config_dir, data_path)

        if not path.exists(data_path):
            raise IOError('Unable to find input dataset: "{}"'.format(data_path))

        axes_fields = []
        for f in self.axes.values():
            axes_fields += [f[1], f[2]]

        load_fields = [f for f in self.fields.values() if f is not None] + axes_fields

        if data_path.endswith(".csv"):
            self.df = pd.read_csv(data_path, usecols=load_fields)

            # parse categorical fields
            for f in self.categorical_fields:
                self.df[f] = self.df[f].astype('category')

        elif data_path.endswith(".castra"):
            import dask.dataframe as dd
            self.df = dd.from_castra(data_path)
            if not outofcore:
                self.df = self.df.cache(cache=dict)

        elif data_path.endswith(".hdf"):
            import dask.dataframe as dd
            self.df = dd.read_hdf(data_path, key="census")
            if not outofcore:
                self.df = self.df.cache(cache=dict)

        else:
            raise IOError("Unknown data file type; .csv and .castra currently supported")
Example #16
def test_read_hdf_start_stop_values():
    pytest.importorskip('tables')
    df = pd.DataFrame({'x': ['a', 'b', 'c', 'd'],
                       'y': [1, 2, 3, 4]}, index=[1., 2., 3., 4.])
    with tmpfile('h5') as fn:
        df.to_hdf(fn, '/data', format='table')

        with pytest.raises(ValueError) as e:
            dd.read_hdf(fn, '/data', stop=10)
        assert 'number of rows' in str(e)

        with pytest.raises(ValueError) as e:
            dd.read_hdf(fn, '/data', start=10)
        assert 'is above or equal to' in str(e)

        with pytest.raises(ValueError) as e:
            dd.read_hdf(fn, '/data', chunksize=-1)
        assert 'positive integer' in str(e)
Example #17
def test_read_hdf():
    pytest.importorskip("tables")
    df = pd.DataFrame({"x": ["a", "b", "c", "d"], "y": [1, 2, 3, 4]}, index=[1.0, 2.0, 3.0, 4.0])
    with tmpfile("h5") as fn:
        df.to_hdf(fn, "/data")
        try:
            dd.read_hdf(fn, "/data", chunksize=2)
            assert False
        except TypeError as e:
            assert "format='table'" in str(e)

    with tmpfile("h5") as fn:
        df.to_hdf(fn, "/data", format="table")
        a = dd.read_hdf(fn, "/data", chunksize=2)
        assert a.npartitions == 2

        tm.assert_frame_equal(a.compute(), df)

        tm.assert_frame_equal(
            dd.read_hdf(fn, "/data", chunksize=2, start=1, stop=3).compute(), pd.read_hdf(fn, "/data", start=1, stop=3)
        )

        assert sorted(dd.read_hdf(fn, "/data").dask) == sorted(dd.read_hdf(fn, "/data").dask)
Example #19
def au_data_frame(directory: str) -> df:
    return df.read_hdf(os.path.join(directory, 'hdfs', 'au_*.hdf'), '/data')
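A helper like this would typically be used as follows (the patient directory is a placeholder):

au_frame = au_data_frame('/path/to/patient_session')  # lazy dask dataframe over all au_*.hdf files
print(au_frame.columns)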
Example #20
                        help='print debugging information',
                        action='store_true',
                        required=False)
    args = parser.parse_args()

    finput = args.paired_data
    reg = args.regulatory
    startdate = args.startdate
    enddate = args.enddate
    species = args.species
    subset_giorgi = args.subset_giorgi
    giorgi_regions = args.giorgi_regions
    verbose = args.verbose

    for ee in giorgi_regions:
        df = dd.read_hdf(finput, '/*').compute()
        mapping_table = {'pm25_ugm3': 'sfc_pm25', 'pm10_ugm3': 'sfc_pm10'}
        sub_map = {i: mapping_table[i] for i in species if i in mapping_table}
        #subsetting data for dates, regulatory calc, and/or giorgi regions
        if startdate != None and enddate != None:
            mask = (df['time'] >= startdate) & (df['time'] <= enddate)
            df = df.loc[mask]
            import datetime
            startdatename_obj = datetime.datetime.strptime(
                startdate, '%Y-%m-%d %H:%M:%S')
            enddatename_obj = datetime.datetime.strptime(
                enddate, '%Y-%m-%d %H:%M:%S')
            startdatename = str(
                datetime.datetime.strftime(startdatename_obj, '%Y-%m-%d_%H'))
            enddatename = str(
                datetime.datetime.strftime(enddatename_obj, '%Y-%m-%d_%H'))
Example #21
 def time_read_hdf5(self, scheduler):
     (dd.read_hdf('{}/*.hdf5'.format(self.data_dir),
                  'key').compute(scheduler=scheduler))
Example #22
        del data_test  # delete the object that is no longer needed

        return test_transformed


if __name__ == "__main__":

    # init class
    start_time = time.time()
    data = LoadBigCsvFile(train, test).read_data()
    gc.collect()
    print('class loaded in %s seconds' % (time.time() - start_time))

    time.sleep(1)  # set some time gap

    # save to hdf for later use or modification
    start_time = time.time()
    data.to_hdf('test_proc.hdf', key='df1')
    print('file saved in hdf in %s seconds' % (time.time() - start_time))

    time.sleep(1)  # set some time gap
    print()
    # check the file and its content
    start_time = time.time()
    hdf_read = dd.read_hdf('test_proc.hdf',
                           key='df1',
                           mode='r',
                           chunksize=10000)
    print('file load into system in %s seconds' % (time.time() - start_time))
    print(hdf_read.head(3))
Example #23
def find_scores(patient_dir: str, refresh=True):
    """
    Finds the scores for a specific patient directory

    :param patient_dir: Directory to look in
    """

    if not refresh and 'au_w_anno.hdf' in os.listdir(
            os.path.join(patient_dir, 'hdfs')):
        return

    try:
        patient, day, session = patient_day_session(patient_dir)
        try:
            au_frame = df.read_hdf(os.path.join(patient_dir, 'hdfs', 'au.hdf'),
                                   '/data')
        except ValueError as e:
            print(e)
            return

        # except ValueError as e:
        # print(e)

        # return

        if 'frame' not in au_frame.columns:
            return

        annotated_values = ["N/A" for _ in range(len(au_frame.index))]

        # here are the hand annotations
        csv_path = os.path.join('/home/emil/emotion_annotations',
                                patient_dir.replace('cropped', 'emotions.csv'))

        # video length is the same as length of the corresponding AU file
        num_frames = len(annotated_values)
        if num_frames != len(au_frame):
            print('this is wrong')
            print(num_frames)
            print(len(au_frame))
            exit()
        # find annotations, if exist. Else just leave the nans
        if os.path.exists(csv_path):
            csv_dict = csv_emotion_reader(csv_path)

            if csv_dict:
                annotated_ratio = int(num_frames / len(csv_dict))
                if annotated_ratio > 1:
                    print('HELLO HERE IS SUCH A CASE:', patient_dir)
                    print('num_frames:', num_frames)
                    print('len of annots:', len(csv_dict))
                if annotated_ratio == 0:
                    annotated_ratio = 1
                csv_dict = {
                    i * annotated_ratio: c
                    for i, c in csv_dict.items()
                }

                for i in [
                        x for x in csv_dict.keys() if 'None' not in csv_dict[x]
                ]:
                    to_write = clean_to_write(csv_dict[i])

                    if i in range(len(annotated_values)):
                        annotated_values[i] = to_write
        # au_frame = au_frame.assign(annotated=annotated_values)
        # au_frame = au_frame.set_index('frame')
        # au_frame["annotated"] = df.from_array(da.from_array(annotated_values, chunks=5))
        annotated_values = da.from_array(annotated_values,
                                         chunks='auto').compute()

        # what we know: au_frame['frame'] starts at 1, goes to (including) 3604
        # annotated_values has length we want, but currently (with the +1) a length of 3605
        # au_frame has a length of 3604 (makes sense, 1-3604)

        au_frame = au_frame.compute()
        au_frame = au_frame.assign(
            annotated=lambda x: annotated_values[x['frame'] - 1])

        au_frame.to_hdf(os.path.join(patient_dir, 'hdfs', 'au_w_anno.hdf'),
                        '/data',
                        format='table')

    except FileNotFoundError as not_found_error:
        print(not_found_error)

    except AttributeError as e:
        print(e)
Example #24
import pandas as pd
import dask.dataframe as dd
import numpy as np

fname = 'testfile.h5'
key = '/group/dataset'

dtype = np.dtype([('idx','i4'),('val_a','f8'),('val_b','f8')])
N = 10000
cs = 1000
#store = pd.HDFStore(output_fname, mode='w')
with pd.HDFStore(fname, mode='w') as store:
    recarray = np.empty(N, dtype)
    df = pd.DataFrame.from_records(recarray)
    store.append(key, df)

df = pd.read_hdf(fname, key)
ddf = dd.read_hdf(fname, key, chunksize=cs)
print(len(df))
print(len(ddf))
Example #25
def test_to_hdf():
    pytest.importorskip('tables')
    df = pd.DataFrame({'x': ['a', 'b', 'c', 'd'],
                       'y': [1, 2, 3, 4]}, index=[1., 2., 3., 4.])
    a = dd.from_pandas(df, 2)

    with tmpfile('h5') as fn:
        a.to_hdf(fn, '/data')
        out = pd.read_hdf(fn, '/data')
        tm.assert_frame_equal(df, out[:])

    with tmpfile('h5') as fn:
        a.x.to_hdf(fn, '/data')
        out = pd.read_hdf(fn, '/data')
        tm.assert_series_equal(df.x, out[:])

    a = dd.from_pandas(df, 1)
    with tmpfile('h5') as fn:
        a.to_hdf(fn, '/data')
        out = pd.read_hdf(fn, '/data')
        tm.assert_frame_equal(df, out[:])

    # saving to multiple datasets
    a = dd.from_pandas(df, 2)
    with tmpfile('h5') as fn:
        a.to_hdf(fn, '/data*')
        out = dd.read_hdf(fn, '/data*')
        tm.assert_frame_equal(df, out.compute())

    # saving to multiple files
    a = dd.from_pandas(df, 2)
    with tmpdir() as dn:
        fn = os.path.join(dn, 'data_*.h5')
        a.to_hdf(fn, '/data')
        out = dd.read_hdf(fn, '/data')
        tm.assert_frame_equal(df, out.compute())

    # saving to multiple datasets with custom name_function
    a = dd.from_pandas(df, 2)
    with tmpfile('h5') as fn:
        a.to_hdf(fn, '/data_*', name_function=lambda i: 'a' * (i +  1))
        out = dd.read_hdf(fn, '/data_*')
        tm.assert_frame_equal(df, out.compute())

        out = pd.read_hdf(fn, '/data_a')
        tm.assert_frame_equal(out, df.iloc[:2])
        out = pd.read_hdf(fn, '/data_aa')
        tm.assert_frame_equal(out, df.iloc[2:])

    # saving to multiple files with custom name_function
    a = dd.from_pandas(df, 2)
    with tmpdir() as dn:
        fn = os.path.join(dn, 'data_*.h5')
        a.to_hdf(fn, '/data', name_function=lambda i: 'a' * (i +  1))
        out = dd.read_hdf(fn, '/data')
        tm.assert_frame_equal(df, out.compute())

        out = pd.read_hdf(os.path.join(dn, 'data_a.h5'), '/data')
        tm.assert_frame_equal(out, df.iloc[:2])
        out = pd.read_hdf(os.path.join(dn, 'data_aa.h5'), '/data')
        tm.assert_frame_equal(out, df.iloc[2:])

    # saving to different datasets in multiple files with custom name_function
    a = dd.from_pandas(df, 2)
    with tmpdir() as dn:
        with pytest.raises(ValueError):
            fn = os.path.join(dn, 'data_*.h5')
            a.to_hdf(fn, '/data_*', name_function=lambda i: 'a' * (i +  1))
Example #26
def process_vid_dir(eyebrow_dict: dict, vid_dir: str) -> None:
    # all_dict_file = os.path.join(vid_dir, 'all_dict.txt')
    patient_name = vid_dir.split('_')[0]
    all_dict_folder = ('all_' + patient_name)

    already_ran_file = os.path.join(vid_dir, 'already_ran.txt')

    diff_dict = json.load(
        open(already_ran_file)) if os.path.exists(already_ran_file) else {}

    if vid_dir not in diff_dict:
        diff_dict[vid_dir] = {}

    emotion_frame = df.read_hdf(os.path.join(
        all_dict_folder, 'hdfs',
        '*.hdf'), '/data') if os.path.exists(
            all_dict_folder) else AUScorer.au_data_frame(vid_dir)
    # emotion_dict = AUScorer.convert_dict_to_int(
    # json.load(open(all_dict_file))) if os.path.exists(
    # all_dict_file) else AUScorer.AUScorer(vid_dir).presence_dict

    include_eyebrows = eyebrow_dict and vid_dir in eyebrow_dict['Eyebrows']
    pre_func_list = [(re_crop_vid_dir, 're_crop'),
                     (throw_vid_in_reverse, 'reverse'),
                     (reverse_re_crop_vid_dir, 'reverse_re_crop')]

    post_func_list = [(invert_colors, 'invert_colors'),
                      (lower_gamma, 'low_gamma'),
                      (increase_gamma, 'high_gamma')]

    dir_list = [name for _, name in pre_func_list + post_func_list]

    to_do_list = [x for _, x in pre_func_list if x not in diff_dict[vid_dir]]

    for func, name in pre_func_list + post_func_list:
        if name not in diff_dict[vid_dir]:
            post_func_frame = func(get_vid_from_dir(vid_dir), vid_dir,
                                   include_eyebrows)
            update_frames(post_func_frame=post_func_frame,
                          emotion_frame=emotion_frame,
                          diff_dict=diff_dict,
                          vid_dir=vid_dir,
                          name=name,
                          func_name='as-is')

    for pre_dir in to_do_list:
        if os.path.exists(os.path.join(vid_dir, pre_dir)):
            if pre_dir not in diff_dict[vid_dir]:
                diff_dict[vid_dir][pre_dir] = {}

            for func, name in post_func_list:
                if name not in diff_dict[vid_dir][pre_dir]:
                    full_path = os.path.join(vid_dir, pre_dir)
                    post_func_frame = func(
                        glob.glob(os.path.join(full_path, '*.avi'))[0],
                        full_path, include_eyebrows)
                    update_frames(post_func_frame, emotion_frame, diff_dict,
                                  vid_dir, pre_dir, name)

    # json.dump(emotion_dict, open(all_dict_file, 'w'))
    json.dump(diff_dict, open(already_ran_file, 'w'))

    for pre_dir in dir_list:
        if os.path.exists(os.path.join(vid_dir, pre_dir)):
            shutil.rmtree(os.path.join(vid_dir, pre_dir))
Example #27
    return (all_models_with_postValiMods)


# load ensemble weights
d = pickle.load(
    open(data_intermed_nb_fldrpath + "/postvali_ensemble_weights.p", "rb"))

ranked1_ensemble_weights = d["ranked1_ensemble_weights"]
ranked5_ensemble_weights = d["ranked5_ensemble_weights"]

del d

# create post-validation sample models on the validation sample
## load component models
all_models_dd = dd.read_hdf(all_models_store_path,
                            key="/all_models_Rehol_NotPlugged/valicast",
                            chunksize=dd_chunksize_valicast)

## add ranked 1 and ranked 5 predictions
all_models_with_postValiMods_dd = add_post_vali_mods(
    df_dd=all_models_dd,
    ensemble_weights=ranked1_ensemble_weights,
    ensemble_name="postValiMod_ens_ranked1")

all_models_with_postValiMods_dd = add_post_vali_mods(
    df_dd=all_models_with_postValiMods_dd,
    ensemble_weights=ranked5_ensemble_weights,
    ensemble_name="postValiMod_ens_ranked5")

all_models_with_postValiMods_dd = all_models_with_postValiMods_dd[[
    "daily_level", "daily_untouched", "postValiMod_ens_ranked1",
Example #28
def test_to_hdf_multiple_datasets():
    df = pd.DataFrame({
        'x': ['a', 'b', 'c', 'd'],
        'y': [1, 2, 3, 4]
    },
                      index=[1., 2., 3., 4.])
    a = dd.from_pandas(df, 2)
    df16 = pd.DataFrame(
        {
            'x': [
                'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l',
                'm', 'n', 'o', 'p'
            ],
            'y': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]
        },
        index=[
            1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
            16.
        ])
    b = dd.from_pandas(df16, 16)

    # saving to multiple datasets making sure order is kept
    with tmpfile('h5') as fn:
        b.to_hdf(fn, '/data*')
        out = dd.read_hdf(fn, '/data*')
        eq(df16, out)

    # saving to multiple datasets
    with tmpfile('h5') as fn:
        a.to_hdf(fn, '/data*')
        out = dd.read_hdf(fn, '/data*')
        eq(df, out)

    # saving to multiple files
    with tmpdir() as dn:
        fn = os.path.join(dn, 'data_*.h5')
        a.to_hdf(fn, '/data')
        out = dd.read_hdf(fn, '/data')
        eq(df, out)

    # saving to multiple datasets with custom name_function
    with tmpfile('h5') as fn:
        a.to_hdf(fn, '/data_*', name_function=lambda i: 'a' * (i + 1))
        out = dd.read_hdf(fn, '/data_*')
        eq(df, out)

        out = pd.read_hdf(fn, '/data_a')
        tm.assert_frame_equal(out, df.iloc[:2])
        out = pd.read_hdf(fn, '/data_aa')
        tm.assert_frame_equal(out, df.iloc[2:])

    # saving to multiple files with custom name_function
    with tmpdir() as dn:
        fn = os.path.join(dn, 'data_*.h5')
        a.to_hdf(fn, '/data', name_function=lambda i: 'a' * (i + 1))
        out = dd.read_hdf(fn, '/data')
        eq(df, out)

        out = pd.read_hdf(os.path.join(dn, 'data_a.h5'), '/data')
        tm.assert_frame_equal(out, df.iloc[:2])
        out = pd.read_hdf(os.path.join(dn, 'data_aa.h5'), '/data')
        tm.assert_frame_equal(out, df.iloc[2:])

    # saving to different datasets in multiple files with custom name_function
    with tmpdir() as dn:
        with pytest.raises(ValueError):
            fn = os.path.join(dn, 'data_*.h5')
            a.to_hdf(fn, '/data_*', name_function=lambda i: 'a' * (i + 1))

    # test hdf object
    with tmpfile('h5') as fn:
        with pd.HDFStore(fn) as hdf:
            a.to_hdf(hdf, '/data*')
            out = dd.read_hdf(fn, '/data*')
            eq(df, out)
Example #29
def test_to_hdf_schedulers(scheduler, npartitions):
    pytest.importorskip("tables")
    df = pd.DataFrame(
        {
            "x": [
                "a",
                "b",
                "c",
                "d",
                "e",
                "f",
                "g",
                "h",
                "i",
                "j",
                "k",
                "l",
                "m",
                "n",
                "o",
                "p",
            ],
            "y": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
        },
        index=[
            1.0,
            2.0,
            3.0,
            4.0,
            5.0,
            6.0,
            7.0,
            8.0,
            9.0,
            10.0,
            11.0,
            12.0,
            13.0,
            14.0,
            15.0,
            16.0,
        ],
    )
    a = dd.from_pandas(df, npartitions=npartitions)

    # test single file single node
    with tmpfile("h5") as fn:
        a.to_hdf(fn, "/data", scheduler=scheduler)
        out = pd.read_hdf(fn, "/data")
        assert_eq(df, out)

    # test multiple files single node
    with tmpdir() as dn:
        fn = os.path.join(dn, "data_*.h5")
        a.to_hdf(fn, "/data", scheduler=scheduler)
        out = dd.read_hdf(fn, "/data")
        assert_eq(df, out)

    # test single file multiple nodes
    with tmpfile("h5") as fn:
        a.to_hdf(fn, "/data*", scheduler=scheduler)
        out = dd.read_hdf(fn, "/data*")
        assert_eq(df, out)
Example #30
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler, Imputer
from sklearn.model_selection import LeaveOneGroupOut, cross_val_predict
from zillow import modelling
import pickle as pkl
import dask.dataframe as dd

test_df = dd.read_hdf("input/test_20172.*.hdf", "data")

with open("input/feat_names.pkl", "rb") as f:
    feat_names = pkl.load(f)

with open("input/feat_names_both.pkl", "rb") as f:
    feat_names_both = pkl.load(f)

test_df = test_df

with open("input/encoders.pkl", "rb") as f:
    encoders = pkl.load(f)

for col, encoder in encoders.items():
    print(encoder.classes_)
    print(test_df[col].head(1000, npartitions=10).unique())
    test_df[col] = test_df.map_partitions(lambda x: encoder.transform(
        list(x[col].replace(np.nan, encoder.classes_[0]).values)),
                                          meta=pd.Series(dtype=np.float32))

with open("input/log_cols.pkl", "rb") as f:
    log_cols = pkl.load(f)
Example #31
def sumTokenCounts(stores,data):
	max_str_bytes = 50
	chunksize = 100000
	batch_limit = 6*10**8
	savestore = data + "final/fromnodes-323.h5"

	for storefile in stores:
		print(storefile)
		logging.info("Next store: %s" % storefile)
		try:
			# Get Unique languages
			with pd.HDFStore(storefile, complevel=9, mode="a", complib='blosc') as store:
				langs = set([key.split("/", maxsplit=-1)[-1] for key in store.keys() if 'merged1' in key])
		except:
			logging.exception("Can't read languages from %s" % storefile)
			continue

		for lang in langs:
			batch = False
			logging.info("Starting lang %s from %s" % (lang, storefile))
			print(lang)

			if not re.match('[a-z]{3}', lang):
				logging.error("lang '%s' is not three alphanumeric characters. Skipping for now. (%s)" % (lang, storefile))
				continue

			try:
				ddf = dd.read_hdf(storefile, '/merged1/'+lang, chunksize=chunksize, mode='r')
			except:
				logging.exception("Can't load Dask DF for %s in %s" % (lang, storefile))
				continue

			# Assuming partitions are equally sized, which they should be if read from a single file
			if ddf.npartitions > np.ceil(batch_limit/chunksize):
				batch = True
				niters = np.floor((ddf.npartitions*chunksize)/batch_limit)
				i = 0

			while True:
				if batch:
					start = i * batch_limit
					logging.info("Starting batch %d for %s" % (i, lang))
					if i == niters:
						# Last batch, no stop value
						ddf = dd.read_hdf(storefile, '/merged1/'+lang, chunksize=chunksize, start=start)
						batch = False
					else:
						ddf = dd.read_hdf(storefile, '/merged1/'+lang, chunksize=chunksize,start=start, stop=(start+batch_limit))
						i += 1
				try:
					logging.info("Starting full merge for %s with %d partitions" % (lang, ddf.npartitions))
					with ProgressBar():
						full_merge = ddf.reset_index().groupby('token').sum().compute()
					#if lang == 'eng':
						# For curiosity: see the profiling for English
					#    prof.visualize()
					logging.info("Success! Saving merged.")
					# The /fromnodes table is the sum from all the different stores, but will need to be summed one more time
					with pd.HDFStore(savestore, complevel=9, mode="a", complib='blosc') as store:
						store.append(lang,full_merge,data_columns=['count'],min_itemsize = {'index': max_str_bytes})
				except:
					logging.exception("Can't compute or save lang for %s in %s" % (lang, storefile))

				if batch == False:
					break
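A hedged sketch of how sumTokenCounts might be invoked (paths are placeholders; data must end with a path separator, since savestore is built by plain string concatenation):

import glob

data = "/path/to/counts/"                        # placeholder, note the trailing slash
stores = sorted(glob.glob(data + "nodes/*.h5"))  # per-node stores containing /merged1/<lang> tables
sumTokenCounts(stores, data)                     # merged counts land in data + "final/fromnodes-323.h5"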
Example #32
params = {
    'axes.labelsize': 'xx-large',
    'axes.titlesize': 'xx-large',
    'xtick.labelsize': 'xx-large',
    'ytick.labelsize': 'xx-large'
}
pylab.rcParams.update(params)

w_dir = 'D:/BIG DATA'
directory_load = 'Merged Files'
directory_save = 'Strategy Reversal'

os.chdir(w_dir)

ccys = ['EURUSD', 'EURCHF', 'EURGBP', 'EURJPY', 'EURAUD']

df = dd.read_hdf(directory_load + '/' + ccys[1] + '-' + str(2014) + '.h5',
                 ccys[1] + str(2014) + '05')
plt.figure()
plt.plot(df.compute())

#%%
with open(directory_save + '/Report.txt', 'r') as fout:
    file = fout.read()

#%%
lines = file.split()

read_ccy = False
read_PL = False

performance_usd = []
performance_chf = []
Example #33
    def create_unigram_book_counts(self,
                                   newtable=True,
                                   ingest=True,
                                   index=True,
                                   reverse_index=True,
                                   table_count=1):
        import time
        t0 = time.time()

        db = self.db
        ngramname = "unigrams"
        tablenameroot = "master_bookcounts"
        # If you are splitting the input into multiple tables
        # to be joined as a merge table, come up with multiple
        # table names and we'll cycle through.
        if table_count == 1:
            tablenames = [tablenameroot]
        elif table_count > 1:
            tablenames = [
                "%s_p%d" % (tablenameroot, i)
                for i in range(1, table_count + 1)
            ]
        else:
            logging.error("You need a positive integer for table_count")
            raise ValueError("You need a positive integer for table_count")

        grampath = ".bookworm/texts/encoded/%s" % ngramname
        tmpdir = "%s/tmp" % grampath

        if (len(grampath) == 0) or (grampath == "/"):
            logging.error(
                "Woah! Don't set the ngram path to your system root!")
            raise ValueError("Woah! Don't set the ngram path to your system root!")

        if newtable:
            if os.path.exists(tmpdir):
                import shutil
                shutil.rmtree(tmpdir)

            logging.info("Dropping older %s table, if it exists" % ngramname)
            for tablename in tablenames:
                db.query("DROP TABLE IF EXISTS " + tablename)

        logging.info("Making a SQL table to hold the %s" % ngramname)
        reverse_index_sql = "INDEX(bookid,wordid,count), " if reverse_index else ""
        for tablename in tablenames:
            db.query(
                "CREATE TABLE IF NOT EXISTS " + tablename + " ("
                "bookid MEDIUMINT UNSIGNED NOT NULL, " + reverse_index_sql +
                "wordid MEDIUMINT UNSIGNED NOT NULL, INDEX(wordid,bookid,count), "
                "count MEDIUMINT UNSIGNED NOT NULL);")

        if ingest:
            for tablename in tablenames:
                db.query("ALTER TABLE " + tablename + " DISABLE KEYS")
            db.query("set NAMES utf8;")
            db.query("set CHARACTER SET utf8;")
            logging.info("loading data using LOAD DATA LOCAL INFILE")

            files = os.listdir(grampath)
            for i, filename in enumerate(files):
                if filename.endswith('.txt'):
                    # With each input file, cycle through each table in tablenames
                    tablename = tablenames[i % len(tablenames)]
                    logging.debug("Importing txt file, %s (%d/%d)" %
                                  (filename, i, len(files)))
                    try:
                        db.query("LOAD DATA LOCAL INFILE '" + grampath + "/" +
                                 filename + "' INTO TABLE " + tablename +
                                 " CHARACTER SET utf8 (bookid,wordid,count);")
                    except KeyboardInterrupt:
                        raise
                    except:
                        logging.debug(
                            "Falling back on insert without LOCAL DATA INFILE. Slower."
                        )
                        try:
                            import pandas as pd
                            df = pd.read_csv(grampath + "/" + filename,
                                             sep='\t',
                                             header=None)
                            to_insert = df.apply(tuple, axis=1).tolist()
                            db.query("INSERT INTO " + tablename +
                                     " (bookid,wordid,count) "
                                     "VALUES (%s, %s, %s);"
                                     "",
                                     many_params=to_insert)
                        except KeyboardInterrupt:
                            raise
                        except:
                            logging.exception("Error inserting %s from %s" %
                                              (ngramname, filename))
                            continue

                elif filename.endswith('.h5'):
                    logging.info("Importing h5 file, %s (%d/%d)" %
                                 (filename, i, len(files)))
                    try:
                        # When encountering an .h5 file, this looks for ngram information
                        # in a /#{ngramnames} table (e.g. /unigrams) and writes it out to
                        # temporary TSV files.
                        # Dask is used here simply because it's a dead simple way to multithread
                        # the TSV writing and lower the overhead versus having a TSV already staged.
                        import csv
                        import pandas as pd
                        try:
                            import dask.dataframe as dd
                        except:
                            logging.exception(
                                "Ingesting h5 files requires dask")
                        try:
                            os.makedirs(tmpdir)
                        except OSError:
                            if not os.path.isdir(tmpdir):
                                raise
                        # Dask will use #{n_cores-1} threads when saving CSVs.
                        # Ingest and key reload times are identical to txt import, so the only
                        # additional overhead is reading the file (small effect) and writing the csv.
                        ddf = dd.read_hdf(grampath + "/" + filename,
                                          ngramname,
                                          mode='r',
                                          chunksize=2000000)
                        ddf.reset_index().to_csv(tmpdir + '/tmp.*.tsv',
                                                 index=False,
                                                 sep='\t',
                                                 header=False,
                                                 quoting=csv.QUOTE_NONNUMERIC)
                        logging.info(
                            "CSV written from H5. Time passed: %.2f s" %
                            (time.time() - t0))
                        for j, tmpfile in enumerate(os.listdir(tmpdir)):
                            # With each input file, cycle through each table in tablenames
                            tablename = tablenames[j % len(tablenames)]
                            path = "%s/%s" % (tmpdir, tmpfile)
                            db.query(
                                "LOAD DATA LOCAL INFILE '" + path + "' "
                                "INTO TABLE " + tablename + " "
                                "CHARACTER SET utf8 (bookid,wordid,count);")
                            try:
                                os.remove(path)
                            except:
                                pass
                        logging.info("CSVs input. Time passed: %.2f s" %
                                     (time.time() - t0))
                    except KeyboardInterrupt:
                        raise
                    except:
                        logging.exception("Error inserting %s from %s" %
                                          (ngramname, filename))
                        continue
                else:
                    continue
        if index:
            logging.info("Creating Unigram Indexes. Time passed: %.2f s" %
                         (time.time() - t0))
            for tablename in tablenames:
                db.query("ALTER TABLE " + tablename + " ENABLE KEYS")

            if table_count > 1:
                logging.info("Creating a merge table for " +
                             ",".join(tablenames))
                db.query(
                    "CREATE TABLE IF NOT EXISTS " + tablenameroot + " ("
                    "bookid MEDIUMINT UNSIGNED NOT NULL, " +
                    reverse_index_sql +
                    "wordid MEDIUMINT UNSIGNED NOT NULL, INDEX(wordid,bookid,count), "
                    "count MEDIUMINT UNSIGNED NOT NULL) "
                    "ENGINE=MERGE UNION=(" + ",".join(tablenames) +
                    ") INSERT_METHOD=LAST;")

        logging.info("Unigram index created in: %.2f s" % ((time.time() - t0)))
Example #34
    def score(self,
              spaceagg,
              timeagg,
              store_minimum=False,
              pp_model=None,
              quantile=''):
        """
        Read the obs and clim. make a comparison object which computes the scores in the dask dataframe. 
        This dask dataframe is exported.
        Returns a list with intermediate filenames of the raw, climatological and corrected scores.
        Has a post-processing step if the pp_model is supplied. Fit is the same regardless of the quantile, so done only once.
        If there are no quantiles to predict or binary variable, we force equidistant sampling (random = True led to overestimations of the crps)
        """
        alignment = ForecastToObsAlignment(season=self.season,
                                           cycle=self.cycle)
        alignment.recollect(booksname=self.log.loc[(spaceagg, timeagg),
                                                   ('booksname', '')])

        climatology = Climatology(
            self.basevar, **{
                'name': self.log.loc[(spaceagg, timeagg),
                                     ('climname', quantile)]
            })
        climatology.localclim(
        )  # loading in this case. Creation was done in the makeclim method.

        if not self.log.loc[(spaceagg, timeagg), (
                'modelclimname', [quantile]
        )].isna().any(
        ):  # Supply model quantile climatology if that was computed earlier. Will be preferred for the raw briescoring in the comparison Class
            modelclimatology = ModelClimatology(
                cycle=self.cycle,
                variable=self.basevar,
                **{
                    'name':
                    self.log.loc[(spaceagg, timeagg),
                                 ('modelclimname', quantile)]
                })
            modelclimatology.local_clim()
            assert self.newvar == 'anom', 'This modelclimatology has likely no adapted units, only when anomalies the quantiles in Kelvin will be compatible with the aligned forecast anomalies in Celsius.'
        else:
            modelclimatology = None

        comp = Comparison(alignment=alignment,
                          climatology=climatology,
                          modelclimatology=modelclimatology)

        # Fitting or accepting external fits (meaning the column is already filled):
        if not pp_model is None:
            if not isinstance(
                    self.log.loc[(spaceagg, timeagg),
                                 ('externalfits', quantile)], str):
                comp.fit_pp_models(pp_model=pp_model,
                                   groupers=['leadtime', 'clustid'])
                firstfitname = comp.export(fits=True, frame=False)
                self.log.loc[(spaceagg, timeagg), (
                    'externalfits', slice(None)
                )] = firstfitname  # Specifically useful for the looping over quantiles.
            else:
                fitname = self.log.loc[(spaceagg, timeagg),
                                       ('externalfits', quantile)]
                print('loading fit from:', fitname)
                comp.fits = dd.read_hdf(
                    comp.basedir + fitname + '.h5',
                    key='fits')  # Loading of the fits of the first quantile.
                comp.fitgroupers = ['leadtime', 'clustid']

        # Going to the scoring.
        if isinstance(quantile, float):
            if not pp_model is None:
                comp.make_pp_forecast(pp_model=pp_model)
            comp.brierscore()
        else:
            if not pp_model is None:
                comp.make_pp_forecast(pp_model=pp_model,
                                      random=False,
                                      n_members=self.ndraws if isinstance(
                                          pp_model, NGR) else None)
                comp.export(fits=False, frame=False, preds=True)
            if (self.newvar is None) or (self.newvar == 'anom'):
                comp.crpsscore()
            else:  # Meaning a custom binary predictand
                comp.brierscore()

        scorefile = comp.export(fits=False,
                                frame=True,
                                store_minimum=store_minimum)

        return (scorefile)
Example #35
def h5_load_range_by_coord(db_path,
                           table,
                           range_coordinates: Optional[Sequence] = None,
                           columns=None,
                           chunksize=None,
                           sorted_index=None,
                           **kwargs) -> dd.DataFrame:
    """
    Load (range by intenger indexes of) hdf5 data to dask dataframe
    :param range_coordinates: control/limit range of data loading:
        tuple of int, start and end indexes - limit returned dask dataframe by this range
        empty tuple - raise Ex_nothing_done
        None, to load all data
    :param cfg_in: dict, with fields:
    :param db_path, str
    :param table, str
        dask.read_hdf() parameters:
    :param chunksize,
    :param sorted_index (optional): bool, default True
    :param columns: passed without change to dask.read_hdf()
    """
    if sorted_index is None:
        sorted_index = True

    if range_coordinates is None:  # start and stop not specified
        print("h5_load_range_by_coord(all)")
        # this is the only option in dask that loads a sorted index
        ddpart = dd.read_hdf(db_path,
                             table,
                             chunksize=chunksize,
                             lock=True,
                             mode='r',
                             columns=columns,
                             sorted_index=sorted_index)
    elif not len(range_coordinates):
        raise Ex_nothing_done('no data')
    else:
        ddpart_size = -np.subtract(*range_coordinates)
        if not ddpart_size:
            return dd.from_array(
                np.zeros(0, dtype=[
                    ('name', 'O'), ('index', 'M8')
                ]))  # DataFrame({},'NoData', {}, [])  # None
        # if ddpart_size < chunksize:
        #     chunksize = ddpart_size  # needed to avoid loading more data than necessary
        # else:
        chunksize = ddpart_size  # otherwise more data than needed gets loaded. Should chunksize be adjusted to split ddpart into equal parts?
        # sorted_index=cfg_in['sorted_index'] does not work with start/stop, so loading without it

        for c in [False, True]:  # try with specified columns first
            try:
                ddpart = dd.read_hdf(db_path,
                                     table,
                                     chunksize=chunksize,
                                     lock=True,
                                     mode='r',
                                     columns=columns,
                                     start=range_coordinates[0],
                                     stop=range_coordinates[-1])
                break
            except KeyError:  # some of specified columns not exist
                # use only existed columns
                with pd.HDFStore(db_path, mode='r') as store:
                    columns = store[table].columns.join(columns, how='inner')
                print('found columns:', columns.values)

        # because of no 'sorted_index' we need:
        ddpart = ddpart.reset_index().set_index(ddpart.index.name or 'index',
                                                sorted=sorted_index)  # 'Time'
    return ddpart
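A minimal usage sketch for the helper above (store path, table name, and index range are hypothetical):

# Load rows 0..10000 of a hypothetical '/navigation' table as a dask dataframe.
ddf = h5_load_range_by_coord('data/cruise.h5', '/navigation',
                             range_coordinates=(0, 10000),
                             columns=['Lat', 'Lon'],
                             chunksize=5000)
df = ddf.compute()  # materialize as pandas when it fits in memory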
Example #36
import dask
from dask import dataframe as dd
from zillow.data_utils import add_features, add_date_features

if __name__ == "__main__":
    import sys
    arg = sys.argv[1]
    if arg not in ["train", "test", "train_2017", "test_2017"]:
        sys.exit(1)

    with dask.set_options(get=dask.get):
        print(r"input/{}.*.hdf".format(arg))
        df = dd.read_hdf(r"input/{}.*.hdf".format(arg),
                         "data",
                         chunksize=1000000)  #.set_index("ParcelId")
        df = add_features(df)
        df = add_date_features(df)
        print(df.head())
        df.to_hdf(r"input/{}2.*.hdf".format(arg), "data")
Example #37
os.chdir(w_dir)

ccys = ['EURUSD', 'EURCHF', 'EURGBP', 'EURJPY', 'EURAUD']

import time

total_start_time = time.time()
descriptions = []

for ccy in ccys:
    for year in range(2003, 2017):

        start_time = time.time()

        df = dd.read_hdf(directory_load + '/' + ccy + '-' + str(year) + '.h5',
                         '*')

        with open(directory_load + '/' + 'Data Description.txt', 'a') as fout:
            description = df.describe().compute()
            descriptions.append([ccy, year, description])
            fout.write('\n'.join([
                '\n\n=======================================================',
                'Ccy: ' + ccy + '           Year: ' + str(year),
                str(description)
            ]))

        elapsed = time.time() - start_time
        with open(directory_load + '/' + 'Data Description.txt', 'a') as fout:
            fout.write('\nTime Elapsed: ' + str(np.round(elapsed, 2)))

total_elapsed = time.time() - total_start_time
Example #38
#x = dataset[:,0:7] # T, P, x_N2, x_O2, x_NO, x_N, x_O
#y = dataset[:,7:]  # D_cidk upper triangular matrix (Dij | j=>i)
#x = df[:,0:7] # T, P, x_N2, x_O2, x_NO, x_N, x_O
#y = df[:,7:]  # D_cidk upper triangular matrix (Dij | j=>i)
#dataset.head()

df.head(10)
#import h5py
#import xarray as xr
import os
import time
#filename = os.path.join('data', 'accounts.*.csv')
#filename
#target = os.path.join('data', 'accounts.h5')
#target
df_hdf = dd.read_hdf('myh4file.h5', ' ')
df_hdf.head()

#f = h5py.File(os.path.join('.', 'myh4file.h5'), mode='r')

import time
import sys
sys.path.insert(0, '../../../Utilities/')

from plotting import newfig, savefig
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from mpl_toolkits.axes_grid1 import make_axes_locatable

import numpy as np
Example #39
0
 def time_read_hdf5_meta(self, scheduler):
     dd.read_hdf('{}/*.hdf5'.format(self.data_dir), 'key')
Example #40
0
sarima_ts_backtrans_store_path = data_intermed_nb_fldrpath + "/sarima_ts_backtrans_store.h5"
sarima_agg_backtrans_store_path = data_intermed_nb_fldrpath + "/sarima_agg_backtrans_store.h5"
summary_stats_store_path = data_intermed_nb_fldrpath + "/summary_stats_store.h5"
ts_store_path = data_intermed_nb_fldrpath + "/ts_store.h5"
component_models_store_path = data_intermed_nb_fldrpath + "/component_models_store.h5"

dask_tmpdir = "/media/disk1/forecast_wiki_traffic/data_intermed/tmp"

# compute amount to trim SARIMA estimates at
# (trim at max observed times a multiplier)
TRIM_MULTIPLIER = 10

## read in time series
ts_daily_long = dd.read_hdf(ts_store_path,
                            "/ts_daily_long",
                            chunksize=803 * 2048)

## get max per time series
trimmax = ts_daily_long.groupby("ts_id")["daily_level"].max().compute()
trimmax = trimmax.to_frame("trimmax")
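
# TRIM_MULTIPLIER is defined above, but the step that turns the per-series maximum
# into a trim ceiling is not shown in this excerpt; per the comment ("trim at max
# observed times a multiplier") it presumably amounts to something like the line
# below, which is an assumption rather than code from the original script.
trimmax_ceiling = trimmax["trimmax"] * TRIM_MULTIPLIER  # assumed: ceiling = observed max * multiplier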

## get max per time series with last 60 truncated
with pd.HDFStore(ts_store_path) as s:
    n = s.get_storer("/ts_daily_long").nrows
    t_max = s.select("/ts_daily_long", start=n - 1, stop=n)
    t_max = t_max.index.values[0][1]
    del n

ts_daily_long_trunc60 = ts_daily_long.reset_index()
ts_daily_long_trunc60 = ts_daily_long_trunc60[ts_daily_long_trunc60.time_d < (
Example #41
0

target = os.path.join('data', 'accounts.h5')
target


# In[6]:


get_ipython().magic("time df_csv.to_hdf(target, '/data')")


# In[7]:


df_hdf = dd.read_hdf(target, '/data')
df_hdf.head()


# ### Compare CSV to HDF5 speeds

# We do a simple computation that requires reading a column of our dataset and compare performance between CSV files and our newly created HDF5 file.  Which do you expect to be faster?

# In[8]:


get_ipython().magic('time df_csv.amount.sum().compute()')


# In[9]:
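
# The body of this In[9] cell is cut off at the example boundary. Given the markdown
# above, it presumably times the same reduction on the HDF5-backed frame, along the
# lines of (an assumption, mirroring the CSV cell above):
get_ipython().magic('time df_hdf.amount.sum().compute()')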
Example #42
0
def _load_basic_dataframe(df_file=None,
                          datatype='sim',
                          config='IC86.2012',
                          energy_reco=True,
                          energy_cut_key='reco_log_energy',
                          log_energy_min=None,
                          log_energy_max=None,
                          columns=None,
                          n_jobs=1,
                          verbose=False,
                          compute=True):

    validate_datatype(datatype)

    if df_file is not None:
        files = df_file
    else:
        paths = get_config_paths()
        file_pattern = os.path.join(paths.comp_data_dir, config, datatype,
                                    'processed_hdf',
                                    'nominal' if datatype == 'sim' else '',
                                    '*.hdf')
        files = sorted(glob.glob(file_pattern))

    ddf = dd.read_hdf(files,
                      key='dataframe',
                      mode='r',
                      columns=columns,
                      chunksize=10000)

    # Energy reconstruction
    if energy_reco:
        model_dict = load_trained_model(
            'linearregression_energy_{}'.format(config), return_metadata=True)
        pipeline = model_dict['pipeline']
        feature_list = list(model_dict['training_features'])

        def add_reco_energy(partition):
            partition['reco_log_energy'] = pipeline.predict(
                partition[feature_list])
            partition['reco_energy'] = 10**partition['reco_log_energy']
            return partition

        ddf = ddf.map_partitions(add_reco_energy)

    # Energy range cut
    if log_energy_min is not None and log_energy_max is not None:

        def apply_energy_cut(partition):
            energy_mask = (partition[energy_cut_key] > log_energy_min) & (
                partition[energy_cut_key] < log_energy_max)
            return partition.loc[energy_mask, :]

        ddf = ddf.map_partitions(apply_energy_cut)

    if compute:
        if verbose:
            pbar = ProgressBar()
            pbar.register()
        scheduler = 'processes' if n_jobs > 1 else 'synchronous'
        df = ddf.compute(scheduler=scheduler, num_workers=n_jobs)
        df = df.reset_index(drop=True)
    else:
        df = ddf

    return df
Example #43
0
def prep_final_unadj_component_models(sarima_ts_backtrans_key,
                                      sarima_agg_backtrans_key,
                                      component_models_intermed_key,
                                      trimmax_var, seas_stat_mod_vars,
                                      chunksize):
    ## combine SARIMA estimates for both aggregates and time series levels
    ### prep time series level model intermediate estimates
    sarima_ts_backtrans = dd.read_hdf(sarima_ts_backtrans_store_path,
                                      sarima_ts_backtrans_key,
                                      chunksize=chunksize)
    sarima_ts_backtrans = sarima_ts_backtrans.reset_index()
    col_ren_dict = {
        "daily_level_predbt": "mod_ts_daily_level_Bt",
        "daily_wowGr_predbt": "mod_ts_daily_wowGr_Bt",
        "weekly_level_predbt": "mod_ts_weekly_level_Bt",
        "weekly_wowGr_predbt": "mod_ts_weekly_wowGr_Bt"
    }
    sarima_ts_backtrans = sarima_ts_backtrans.rename(columns=col_ren_dict)
    del col_ren_dict

    ### prep aggregate level model intermediate estimates
    sarima_agg_backtrans = dd.read_hdf(sarima_agg_backtrans_store_path,
                                       sarima_agg_backtrans_key,
                                       chunksize=chunksize)
    col_ren_dict = {
        "daily_level_shrtAdj_predbt": "mod_agg_daily_level_Bt",
        "daily_wowGr_shrtAdj_predbt": "mod_agg_daily_wowGr_Bt",
        "weekly_level_shrtAdj_predbt": "mod_agg_weekly_level_Bt",
        "weekly_wowGr_shrtAdj_predbt": "mod_agg_weekly_wowGr_Bt"
    }
    sarima_agg_backtrans = sarima_agg_backtrans.rename(columns=col_ren_dict)
    del col_ren_dict

    ### put together intermediate SARIMA estimates
    component_models_intermed = dd.merge(sarima_ts_backtrans,
                                         sarima_agg_backtrans,
                                         on=["ts_id", "time_d"])

    ## trim outliers from SARIMA estimates
    component_models_intermed = dd.merge(
        component_models_intermed,
        trimmax_df[[trimmax_var]].rename(columns={trimmax_var: "trimmax_this"}),
        left_on=["ts_id"],  # merge on ts_id against trimmax_df's index
        right_index=True)

    cols = [
        "mod_ts_daily_level_Bt", "mod_ts_daily_wowGr_Bt",
        "mod_ts_weekly_level_Bt", "mod_ts_weekly_wowGr_Bt",
        "mod_agg_daily_level_Bt", "mod_agg_daily_wowGr_Bt",
        "mod_agg_weekly_level_Bt", "mod_agg_weekly_wowGr_Bt"
    ]
    for v in cols:
        component_models_intermed[v + "Trim"] = component_models_intermed[[
            v, "trimmax_this"
        ]].min(axis=1, skipna=False)
    del cols

    ## combine summary stat estimates
    with pd.HDFStore(summary_stats_store_path, mode="r") as s:
        sstat = s.select("/ts_stat_dayofweek",
                         columns=list(seas_stat_mod_vars.keys()))
        sstat.rename(columns=seas_stat_mod_vars, inplace=True)
        sstat.reset_index(inplace=True)

    component_models_intermed[
        "day_of_week"] = component_models_intermed.time_d.dt.dayofweek

    component_models_intermed = dd.merge(component_models_intermed,
                                         sstat,
                                         on=["ts_id", "day_of_week"])

    del component_models_intermed["day_of_week"]

    # save results to h5
    component_models_intermed.to_hdf(component_models_store_path,
                                     key=component_models_intermed_key,
                                     compute=True,
                                     format="table",
                                     data_columns=["ts_id", "time_d"])

    # check counts
    with pd.HDFStore(component_models_store_path, mode="r") as s:
        n0 = s.get_storer(component_models_intermed_key).nrows

    with pd.HDFStore(sarima_ts_backtrans_store_path, mode="r") as s:
        n1 = s.get_storer(sarima_ts_backtrans_key).nrows

    with pd.HDFStore(sarima_agg_backtrans_store_path, mode="r") as s:
        n2 = s.get_storer(sarima_agg_backtrans_key).nrows

    assert (n0 == n1)
    assert (n1 == n2)
Example #44
0
def test_to_hdf_lock_delays():
    pytest.importorskip("tables")
    df16 = pd.DataFrame(
        {
            "x": ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p"],
            "y": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
        },
        index=[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0],
    )
    a = dd.from_pandas(df16, 16)

    # adding artificial delays to the earlier rows so that the last tasks finish first
    # (i.e. the first tasks finish last, exercising out-of-order completion)
    def delayed_nop(i):
        if i[1] < 10:
            sleep(0.1 * (10 - i[1]))
        return i

    # saving to multiple hdf nodes
    with tmpfile() as fn:
        a = a.apply(delayed_nop, axis=1, meta=a)
        a.to_hdf(fn, "/data*")
        out = dd.read_hdf(fn, "/data*")
        assert_eq(df16, out)

    # saving to multiple hdf files
    # adding artificial delays to make sure last tasks finish first
    with tmpdir() as dn:
        fn = os.path.join(dn, "data*")
        a = a.apply(delayed_nop, axis=1, meta=a)
        a.to_hdf(fn, "/data")
        out = dd.read_hdf(fn, "/data")
        assert_eq(df16, out)
Example #45
0
    def create_unigram_book_counts(self, newtable=True, ingest=True, index=True, reverse_index=True, table_count=1):
        import time
        t0 = time.time()

        db = self.db
        ngramname = "unigrams"
        tablenameroot = "master_bookcounts"
        # If you are splitting the input into multiple tables
        # to be joined as a merge table, come up with multiple 
        # table names and we'll cycle through.
        if table_count == 1:
            tablenames = [tablenameroot]
        elif table_count > 1:
            tablenames = ["%s_p%d" % (tablenameroot, i) for i in range(1, table_count+1)]
        else:
            logging.error("You need a positive integer for table_count")
            raise ValueError("table_count must be a positive integer")

        grampath =  ".bookworm/texts/encoded/%s" % ngramname
        tmpdir = "%s/tmp" % grampath

        if (len(grampath) == 0) or (grampath == "/"):
            logging.error("Woah! Don't set the ngram path to your system root!")
            raise ValueError("refusing to use %r as the ngram path" % grampath)
        
        if newtable:
            if os.path.exists(tmpdir):
                import shutil
                shutil.rmtree(tmpdir)
        
            logging.info("Dropping older %s table, if it exists" % ngramname)
            for tablename in tablenames:
                db.query("DROP TABLE IF EXISTS " + tablename)

        logging.info("Making a SQL table to hold the %s" % ngramname)
        reverse_index_sql = "INDEX(bookid,wordid,count), " if reverse_index else ""
        for tablename in tablenames:
            db.query("CREATE TABLE IF NOT EXISTS " + tablename + " ("
                "bookid MEDIUMINT UNSIGNED NOT NULL, " + reverse_index_sql +
                "wordid MEDIUMINT UNSIGNED NOT NULL, INDEX(wordid,bookid,count), "
                "count MEDIUMINT UNSIGNED NOT NULL);")

        if ingest:
            for tablename in tablenames:
                db.query("ALTER TABLE " + tablename + " DISABLE KEYS")
            db.query("set NAMES utf8;")
            db.query("set CHARACTER SET utf8;")
            logging.info("loading data using LOAD DATA LOCAL INFILE")
            
            files = os.listdir(grampath)
            for i, filename in enumerate(files):
                if filename.endswith('.txt'):
                    # With each input file, cycle through each table in tablenames
                    tablename = tablenames[i % len(tablenames)]
                    logging.debug("Importing txt file, %s (%d/%d)" % (filename, i, len(files)))
                    try:
                        db.query("LOAD DATA LOCAL INFILE '" + grampath + "/" + filename + "' INTO TABLE " + tablename +" CHARACTER SET utf8 (bookid,wordid,count);")
                    except KeyboardInterrupt:
                        raise
                    except:
                        logging.debug("Falling back on INSERT without LOAD DATA LOCAL INFILE. Slower.")
                        try:
                            import pandas as pd
                            df = pd.read_csv(grampath + "/" + filename, sep='\t', header=None)
                            to_insert = df.apply(tuple, axis=1).tolist()
                            db.query(
                                "INSERT INTO " + tablename + " (bookid,wordid,count) "
                                "VALUES (%s, %s, %s);",
                                many_params=to_insert
                            )
                        except KeyboardInterrupt:
                            raise
                        except:
                            logging.exception("Error inserting %s from %s" % (ngramname, filename))
                            continue

                elif filename.endswith('.h5'):
                    logging.info("Importing h5 file, %s (%d/%d)" % (filename, i, len(files)))
                    try:
                        # When encountering an .h5 file, this looks for ngram information
                        # in a /#{ngramnames} table (e.g. /unigrams) and writes it out to
                        # temporary TSV files.
                        # Dask is used here simply because it's a dead simple way to multithread
                        # the TSV writing and lower the overhead versus having a TSV already staged.
                        import csv
                        import pandas as pd
                        try:
                            import dask.dataframe as dd
                        except:
                            logging.exception("Ingesting h5 files requires dask")
                        try:
                            os.makedirs(tmpdir)
                        except OSError:
                            if not os.path.isdir(tmpdir):
                                raise
                        # Dask will use #{n_cores-1} threads when saving CSVs.
                        # Ingest and key reload times are identical to txt import, so the only
                        # additional overhead is reading the file (small effect) and writing the csv.
                        ddf = dd.read_hdf(grampath + "/" + filename,
                                          ngramname, mode='r', chunksize=2000000)
                        ddf.reset_index().to_csv(tmpdir + '/tmp.*.tsv',
                                                 index=False, sep='\t', header=False,
                                                 quoting=csv.QUOTE_NONNUMERIC)
                        logging.info("CSV written from H5. Time passed: %.2f s" % (time.time() - t0))
                        for j, tmpfile in enumerate(os.listdir(tmpdir)):
                            # With each input file, cycle through each table in tablenames
                            tablename = tablenames[j % len(tablenames)]
                            path = "%s/%s" % (tmpdir, tmpfile)
                            db.query("LOAD DATA LOCAL INFILE '" + path + "' "
                                     "INTO TABLE " + tablename + " "
                                     "CHARACTER SET utf8 (bookid,wordid,count);")
                            try:
                                os.remove(path)
                            except:
                                pass
                        logging.info("CSVs input. Time passed: %.2f s" % (time.time() - t0))
                    except KeyboardInterrupt:
                        raise
                    except:
                        logging.exception("Error inserting %s from %s" % (ngramname, filename))
                        continue
                else:
                    continue
        if index:
            logging.info("Creating Unigram Indexes. Time passed: %.2f s" % (time.time() - t0))
            for tablename in tablenames:
                db.query("ALTER TABLE " + tablename + " ENABLE KEYS")

            if table_count > 1:
                logging.info("Creating a merge table for " + ",".join(tablenames))
                db.query("CREATE TABLE IF NOT EXISTS " + tablenameroot + " ("
                    "bookid MEDIUMINT UNSIGNED NOT NULL, " + reverse_index_sql +
                    "wordid MEDIUMINT UNSIGNED NOT NULL, INDEX(wordid,bookid,count), "
                    "count MEDIUMINT UNSIGNED NOT NULL) "
                    "ENGINE=MERGE UNION=(" + ",".join(tablenames) + ") INSERT_METHOD=LAST;")

        logging.info("Unigram index created in: %.2f s" % ((time.time() - t0)))
Example #46
0
def test_to_hdf_multiple_nodes():
    pytest.importorskip("tables")
    df = pd.DataFrame({
        "x": ["a", "b", "c", "d"],
        "y": [1, 2, 3, 4]
    },
                      index=[1.0, 2.0, 3.0, 4.0])
    a = dd.from_pandas(df, 2)
    df16 = pd.DataFrame(
        {
            "x": ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p"],
            "y": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
        },
        index=[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0],
    )
    b = dd.from_pandas(df16, 16)

    # saving to multiple nodes
    with tmpfile("h5") as fn:
        a.to_hdf(fn, "/data*")
        out = dd.read_hdf(fn, "/data*")
        assert_eq(df, out)

    # saving to multiple nodes making sure order is kept
    with tmpfile("h5") as fn:
        b.to_hdf(fn, "/data*")
        out = dd.read_hdf(fn, "/data*")
        assert_eq(df16, out)

    # saving to multiple datasets with custom name_function
    with tmpfile("h5") as fn:
        a.to_hdf(fn, "/data_*", name_function=lambda i: "a" * (i + 1))
        out = dd.read_hdf(fn, "/data_*")
        assert_eq(df, out)

        out = pd.read_hdf(fn, "/data_a")
        tm.assert_frame_equal(out, df.iloc[:2])
        out = pd.read_hdf(fn, "/data_aa")
        tm.assert_frame_equal(out, df.iloc[2:])

    # test multiple nodes with hdf object
    with tmpfile("h5") as fn:
        with pd.HDFStore(fn) as hdf:
            b.to_hdf(hdf, "/data*")
            out = dd.read_hdf(fn, "/data*")
            assert_eq(df16, out)
Example #47
0
import icae.tools.plot_data

from icae.tools import EMD
from icae.tools import nn
from icae.tools import status_report
from icae.tools import AE_training
from icae.tools import AE_single as AEs_tools
from icae.tools import data_loader
from icae.models import single_event as AE_models
from icae.tools.config_loader import config

# -

# FIXME: deprecated references (data, optimal_NB). Does 02c-single-waveform-torch.py replace this?

df = dd.read_hdf("../" + config.data.retabled + "*.hdf",
                 key=config.data.hdf_key)

model, encoder = AE_models.optimal_NB(3, loss_method=loss_method)
hist = AE_training.train(model, data, verbose=1, epochs=6, batch_size=1000)
status_report.init(model, "BN 1k-bx100e new preprocessor", "-")
status_report.save_plot("loss")

# data = AE_lib.preprocess(AE_lib.load_mc())
AE_training.plot_results(model, data[:100000])
status_report.save_plot("overview-no-translation")

inlier, outlier = AEs_tools.seperate_outliers(model, data)
status_report.save_plot("outlier-seperation")
status_report.save_obj({
    "inlier indices": inlier,
    "outlier indices": outlier
Example #48
0
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler, Imputer
from sklearn.model_selection import LeaveOneGroupOut, cross_val_predict
from zillow import modelling
import pickle as pkl
import dask.dataframe as dd

train_df = pd.concat([
    #dd.read_hdf("input/train2.*.hdf", "data").compute(),
    dd.read_hdf("input/train_20172.*.hdf", "data").compute(),
])
month = train_df["yearmonth"]

train_y = train_df['logerror']

tolerance = 100
y = np.clip(train_y,
            np.median(train_y)-tolerance,
            np.median(train_y)+tolerance)

cv = LeaveOneGroupOut()

train_df = pd.concat([
    pd.read_csv("stack_stage1_{}.csv".format(i), index_col=0)
    for i in [1,2,3]
], axis=1)

last_month = month.max()
filt = (month == last_month)
Example #49
0
    columns = [f'c{i}' for i in range(table_size[0])]
    df = pd.DataFrame(np.random.rand(*table_size),columns=columns,index=[index]*table_size[0])
    return df

data_file = config.root + config.data.retabled_single
# %%
f = pd.HDFStore(data_file, 'r')  # used below via get_storer()/select()
f_raw = h5py.File(data_file, 'r')  # open read-only; mode 'w' would truncate the file

# %%
store = f.get_storer('frame')
# %%
type(store)

# %%
ddf = dd.read_hdf(data_file,'frame')

# %%
ddf['x'].max().compute()
# %%
# def maximum(f):
chunksize = 1000000
frame = 0
col = 'x'
maximum = 0

# %%
while True:
    try:
        df = f.select('frame',f"frame>={frame} & frame < {frame+chunksize}")
        maximum = max([maximum,df[col].max()])