Exemplo n.º 1
0
def test_time_series_default_values():
    """Check the values `TimeSeries` fills in when time and error are None.

    Three inputs are exercised: a single channel, a rectangular block of
    channels, and a ragged list of channels with different lengths.
    """
    n_channels = 3
    t, m, e = sample_time_series(channels=n_channels)
    n_points = m.shape[1]
    expected_time = np.linspace(0., time_series.DEFAULT_MAX_TIME, n_points)
    expected_error = np.repeat(time_series.DEFAULT_ERROR_VALUE, n_points)

    # Single channel: the defaults are compared against the whole attribute.
    single = TimeSeries(None, m[0], None)
    npt.assert_allclose(single.time, expected_time)
    npt.assert_allclose(single.error, expected_error)
    assert single.n_channels == 1

    # Rectangular multi-channel input: only the first channel is compared.
    stacked = TimeSeries(None, m, None)
    npt.assert_allclose(stacked.time[0], expected_time)
    npt.assert_allclose(stacked.error[0], expected_error)
    assert stacked.n_channels == n_channels

    # Ragged input: channel k keeps only its first k + 2 samples.
    t = [row[0:k + 2] for k, row in enumerate(t)]
    m = [row[0:k + 2] for k, row in enumerate(m)]
    e = [row[0:k + 2] for k, row in enumerate(e)]
    ragged = TimeSeries(None, m, None)
    for k in range(n_channels):
        size = len(m[k])
        npt.assert_allclose(
            ragged.time[k],
            np.linspace(0., time_series.DEFAULT_MAX_TIME, size))
        npt.assert_allclose(
            ragged.error[k],
            np.repeat(time_series.DEFAULT_ERROR_VALUE, size))
    assert ragged.n_channels == n_channels
Exemplo n.º 2
0
def test_transform_ts_files():
    """Check that the "Train/Test Split" transform returns two outputs."""
    class_sizes = (('class1', 4), ('class2', 8))
    transform_type = "Train/Test Split"

    # Build the labelled input: 4 'class1' series followed by 8 'class2'.
    series = []
    for target, count in class_sizes:
        for _ in range(count):
            series.append(TimeSeries(*sample_values(), target=target))

    output = transformation.transform_ts_files(series, transform_type)
    npt.assert_equal(len(output), 2)
Exemplo n.º 3
0
def test_train_test_split_ratios():
    """Check that a 50/50 `train_test_split` halves the input.

    Builds 4 'class1' and 8 'class2' series, splits them with
    ``test_size=0.5`` and ``train_size=0.5`` and asserts that both returned
    collections hold half of the input series.
    """
    n_class1 = 4
    n_class2 = 8
    time_series = [TimeSeries(*sample_values(), target='class1')
                   for _ in range(n_class1)]
    time_series += [TimeSeries(*sample_values(), target='class2')
                    for _ in range(n_class2)]
    outputs = transformation.train_test_split(
        time_series, test_size=0.5, train_size=0.5)

    # Lengths are integers, so compare against an integer half rather than
    # the float produced by true division.
    half = len(time_series) // 2
    npt.assert_equal(len(outputs[1]), half)
    npt.assert_equal(len(outputs[0]), half)
Exemplo n.º 4
0
def sample_ts_files(size, targets=[None]):
    """Yield the paths of `size` temporary netCDF time-series files.

    Parameters
    ----------
    size : int
        Number of files to create.
    targets : iterable
        Targets assigned to the generated series; the values are cycled
        until `size` files exist. The default list is only read, never
        mutated.

    Yields
    ------
    list of str
        Paths of the written ``.nc`` files, all located in one freshly
        created temporary directory.

    Notes
    -----
    The temporary directory is removed when the generator is resumed after
    the ``yield``, when it is closed, or when an exception is raised while
    the files are written or thrown in at the ``yield``.
    """
    temp_dir = tempfile.mkdtemp()
    try:
        paths = []
        for target in islice(cycle(targets), size):
            t, m, e = sample_values()
            name = str(uuid.uuid4())
            path = pjoin(temp_dir, '{}.nc'.format(name))
            ts = TimeSeries(t, m, e, target=target, path=path, name=name)
            ts.to_netcdf(path)
            paths.append(path)

        yield paths
    finally:
        # Always clean up, otherwise a failing consumer leaks the directory.
        shutil.rmtree(temp_dir)
Exemplo n.º 5
0
def sample_ts_files(size, labels=[None]):
    """Yield the paths of `size` temporary ``.npz`` time-series files.

    Parameters
    ----------
    size : int
        Number of files to create.
    labels : iterable
        Labels assigned to the generated series; the values are cycled
        until `size` files exist. The default list is only read, never
        mutated.

    Yields
    ------
    list of str
        Paths of the saved ``.npz`` files, all located in one freshly
        created temporary directory.

    Notes
    -----
    The temporary directory is removed when the generator is resumed after
    the ``yield``, when it is closed, or when an exception is raised while
    the files are written or thrown in at the ``yield``.
    """
    temp_dir = tempfile.mkdtemp()
    try:
        paths = []
        for label in islice(cycle(labels), size):
            t, m, e = sample_values()
            name = str(uuid.uuid4())
            path = pjoin(temp_dir, '{}.npz'.format(name))
            ts = TimeSeries(t, m, e, label=label, path=path, name=name)
            ts.save(path)
            paths.append(path)

        yield paths
    finally:
        # Always clean up, otherwise a failing consumer leaks the directory.
        shutil.rmtree(temp_dir)
Exemplo n.º 6
0
def test_time_series_init_2d():
    """Check `TimeSeries` construction from multi-channel inputs.

    First every argument is passed with one row per channel; then time and
    error are passed for a single channel only and are expected to take the
    shape of the measurements.
    """
    n_channels = 3
    t, m, e = sample_time_series(channels=n_channels)

    # All three inputs carry one row per channel.
    full = TimeSeries(t, m, e)
    for actual, expected in ((full.time, t),
                             (full.measurement, m),
                             (full.error, e)):
        assert actual.shape == expected.shape
        assert np.allclose(actual, expected)
    assert full.n_channels == n_channels

    # Time and error given for one channel only.
    shared = TimeSeries(t[0], m, e[0])
    assert shared.time.shape == m.shape
    assert np.allclose(shared.time[0], t[0])
    assert shared.measurement.shape == m.shape
    assert np.allclose(shared.measurement, m)
    assert shared.error.shape == m.shape
    assert np.allclose(shared.error[0], e[0])
    assert shared.n_channels == n_channels
Exemplo n.º 7
0
def sample_ts_files(size, targets=[None]):
    """Yield the paths of `size` temporary netCDF time-series files.

    Parameters
    ----------
    size : int
        Number of files to create.
    targets : iterable
        Targets assigned to the generated series; the values are cycled
        until `size` files exist. The default list is only read, never
        mutated.

    Yields
    ------
    list of str
        Paths of the written ``.nc`` files, all located in one freshly
        created temporary directory.

    Notes
    -----
    The temporary directory is removed when the generator is resumed after
    the ``yield``, when it is closed, or when an exception is raised while
    the files are written or thrown in at the ``yield``.
    """
    temp_dir = tempfile.mkdtemp()
    try:
        paths = []
        for target in islice(cycle(targets), size):
            t, m, e = sample_values()
            name = str(uuid.uuid4())
            path = pjoin(temp_dir, '{}.nc'.format(name))
            ts = TimeSeries(t, m, e, target=target, path=path, name=name)
            ts.to_netcdf(path)
            paths.append(path)

        yield paths
    finally:
        # Always clean up, otherwise a failing consumer leaks the directory.
        shutil.rmtree(temp_dir)
Exemplo n.º 8
0
def sample_ts_files(size, labels=[None]):
    """Yield the paths of `size` temporary ``.npz`` time-series files.

    Parameters
    ----------
    size : int
        Number of files to create.
    labels : iterable
        Labels assigned to the generated series; the values are cycled
        until `size` files exist. The default list is only read, never
        mutated.

    Yields
    ------
    list of str
        Paths of the saved ``.npz`` files, all located in one freshly
        created temporary directory.

    Notes
    -----
    The temporary directory is removed when the generator is resumed after
    the ``yield``, when it is closed, or when an exception is raised while
    the files are written or thrown in at the ``yield``.
    """
    temp_dir = tempfile.mkdtemp()
    try:
        paths = []
        for label in islice(cycle(labels), size):
            t, m, e = sample_values()
            name = str(uuid.uuid4())
            path = pjoin(temp_dir, '{}.npz'.format(name))
            ts = TimeSeries(t, m, e, label=label, path=path, name=name)
            ts.save(path)
            paths.append(path)

        yield paths
    finally:
        # Always clean up, otherwise a failing consumer leaks the directory.
        shutil.rmtree(temp_dir)
Exemplo n.º 9
0
def test_time_series_init_1d():
    """Check that a single-channel `TimeSeries` stores its inputs as given."""
    t, m, e = sample_time_series(channels=1)
    ts = TimeSeries(t, m, e)

    # Each stored attribute must match its input in both shape and value.
    for actual, expected in ((ts.time, t),
                             (ts.measurement, m),
                             (ts.error, e)):
        assert actual.shape == expected.shape
        assert np.allclose(actual, expected)
    assert ts.n_channels == 1
Exemplo n.º 10
0
def test_train_test_split():
    """Check that the "Train/Test Split" transform splits each class evenly.

    The input is unevenly labelled (4 'class1', 8 'class2'); after the
    split, train and test are each expected to hold half of every class.
    """
    class_sizes = (('class1', 4), ('class2', 8))
    transform_type = "Train/Test Split"

    series = []
    for target, count in class_sizes:
        for _ in range(count):
            series.append(TimeSeries(*sample_values(), target=target))

    # Fix the seed right before the split, as the result is compared exactly.
    np.random.seed(0)
    train, test = transformation.transform_ts_files(series, transform_type)

    for target, count in class_sizes:
        n_train = sum(ts.target == target for ts in train)
        npt.assert_equal(n_train, count / 2)
        n_test = sum(ts.target == target for ts in test)
        npt.assert_equal(n_test, count / 2)
Exemplo n.º 11
0
def test_time_series_init_ragged():
    """Check `TimeSeries` construction from channels of unequal length."""
    n_channels = 3
    t, m, e = sample_time_series(channels=n_channels)

    # Channel k keeps only its first k + 2 samples, making the input ragged.
    t = [row[0:k + 2] for k, row in enumerate(t)]
    m = [row[0:k + 2] for k, row in enumerate(m)]
    e = [row[0:k + 2] for k, row in enumerate(e)]

    ts = TimeSeries(t, m, e)
    channel_ids = range(len(t))
    for k in channel_ids:
        assert np.allclose(ts.time[k], t[k])
    for k in channel_ids:
        assert np.allclose(ts.measurement[k], m[k])
    for k in channel_ids:
        assert np.allclose(ts.error[k], e[k])
    assert ts.n_channels == n_channels
Exemplo n.º 12
0
def to_cesium_dataset(X):
    """Transform a tslearn-compatible dataset into a cesium dataset.

    Parameters
    ----------
    X: array-like, shape = (n_ts, sz, d)
        tslearn-formatted dataset to be cast to cesium format. Lists of
        series with different lengths are accepted as well (see Examples).

    Returns
    -------
    list of cesium TimeSeries
        cesium-formatted dataset, one TimeSeries per input series (cf.
        `link <http://cesium-ml.org/docs/api/cesium.time_series.html#cesium.time_series.TimeSeries>`_)

    Raises
    ------
    ImportError
        If cesium is not installed.

    Examples
    --------
    >>> tslearn_arr = numpy.random.randn(3, 16, 1)
    >>> cesium_ds = to_cesium_dataset(tslearn_arr)
    >>> len(cesium_ds)
    3
    >>> cesium_ds[0].measurement.shape
    (16,)
    >>> tslearn_arr = numpy.random.randn(3, 16, 2)
    >>> cesium_ds = to_cesium_dataset(tslearn_arr)
    >>> len(cesium_ds)
    3
    >>> cesium_ds[0].measurement.shape
    (2, 16)
    >>> tslearn_arr = [[1, 2, 3], [1, 2, 3, 4]]
    >>> cesium_ds = to_cesium_dataset(tslearn_arr)
    >>> len(cesium_ds)
    2
    >>> cesium_ds[0].measurement.shape
    (3,)

    Notes
    -----
    Conversion from/to cesium format requires cesium to be installed.
    """  # noqa: E501
    try:
        from cesium.time_series import TimeSeries
    except ImportError as exc:
        # Chain explicitly so the original import failure stays attached
        # as the cause of the friendlier error.
        raise ImportError("Conversion from/to cesium cannot be performed "
                          "if cesium is not installed.") from exc

    def transpose_or_flatten(ts):
        # Keep only the first ts_size(ts) samples of the series.
        ts_ = ts[:ts_size(ts)]
        if ts.shape[1] == 1:
            # A single feature is flattened to a 1-d measurement.
            return ts_.reshape((-1, ))
        else:
            # Several features: one row per feature in the result.
            return ts_.transpose()

    X_ = check_dataset(X)
    return [TimeSeries(m=transpose_or_flatten(Xi)) for Xi in X_]
Exemplo n.º 13
0
def test_time_series_sort():
    """Check that `TimeSeries` orders every channel by increasing time.

    Covers a single channel, several channels with their own time values,
    and several channels sharing one time/error vector.
    """
    # Single channel with the first two entries reversed.
    t, m, e = sample_time_series(channels=1)
    t[:2] = t[1::-1]
    ts = TimeSeries(t, m, e)
    order = np.argsort(t)
    npt.assert_allclose(ts.time, np.sort(t))
    npt.assert_allclose(ts.measurement, m[order])
    npt.assert_allclose(ts.error, e[order])

    # Several channels, each with its first two time entries reversed.
    n_channels = 3
    t, m, e = sample_time_series(channels=n_channels)
    t[:, :2] = t[:, 1::-1]
    ts = TimeSeries(t, m, e)
    for k in range(len(m)):
        order = np.argsort(t[k])
        npt.assert_allclose(ts.time[k], np.sort(t[k]))
        npt.assert_allclose(ts.measurement[k], m[k][order])
        npt.assert_allclose(ts.error[k], e[k][order])

    # One time/error vector shared by every channel.
    ts = TimeSeries(t[0], m, e[0])
    for k in range(len(m)):
        order = np.argsort(t[0])
        npt.assert_allclose(ts.time[k], np.sort(t[0]))
        npt.assert_allclose(ts.measurement[k], m[k][order])
        npt.assert_allclose(ts.error[k], e[0][order])
Exemplo n.º 14
0
def test_time_series_netCDF():
    """Check that a `TimeSeries` survives a netCDF write/read round trip.

    Exercised with a single channel, several channels sharing one
    time/error vector, and ragged channels.
    """
    def assert_roundtrip(series):
        # Write to the shared test path, read it back and compare.
        series.to_netcdf(TEST_TS_PATH)
        restored = time_series.from_netcdf(TEST_TS_PATH)
        assert_ts_equal(series, restored)

    n_channels = 3
    t, m, e = sample_time_series(channels=n_channels)

    assert_roundtrip(TimeSeries(t[0], m[0], e[0]))
    assert_roundtrip(TimeSeries(t[0], m, e[0]))

    # Channel k keeps only its first k + 2 samples, making the input ragged.
    t = [row[0:k + 2] for k, row in enumerate(t)]
    m = [row[0:k + 2] for k, row in enumerate(m)]
    e = [row[0:k + 2] for k, row in enumerate(e)]
    assert_roundtrip(TimeSeries(t, m, e))
Exemplo n.º 15
0
def test_channels_iterator():
    """Check that `TimeSeries.channels()` yields each channel's (t, m, e).

    Exercised with a single channel, rectangular multi-channel data and
    ragged multi-channel data.
    """
    n_channels = 3
    t, m, e = sample_time_series(channels=n_channels)

    # Single channel: every yielded triple must equal the inputs.
    single = TimeSeries(t[0], m[0], e[0])
    for chan_t, chan_m, chan_e in single.channels():
        npt.assert_allclose(chan_t, t[0])
        npt.assert_allclose(chan_m, m[0])
        npt.assert_allclose(chan_e, e[0])

    def assert_channels_match(series, times, values, errors):
        # Compare the k-th yielded triple with the k-th input rows.
        for k, (chan_t, chan_m, chan_e) in zip(range(n_channels),
                                               series.channels()):
            npt.assert_allclose(chan_t, times[k])
            npt.assert_allclose(chan_m, values[k])
            npt.assert_allclose(chan_e, errors[k])

    assert_channels_match(TimeSeries(t, m, e), t, m, e)

    # Channel k keeps only its first k + 2 samples, making the input ragged.
    t = [row[0:k + 2] for k, row in enumerate(t)]
    m = [row[0:k + 2] for k, row in enumerate(m)]
    e = [row[0:k + 2] for k, row in enumerate(e)]
    assert_channels_match(TimeSeries(t, m, e), t, m, e)
Exemplo n.º 16
0
def test_time_series_npz(tmpdir):
    """Check that a `TimeSeries` survives a ``.npz`` save/load round trip.

    Each case is written to its own uniquely named file inside `tmpdir`.
    Exercised with a single channel, several channels sharing one
    time/error vector, and ragged channels.
    """
    def assert_roundtrip(series):
        # Save under a fresh file name, load it back and compare.
        path = os.path.join(str(tmpdir), str(uuid4()) + '.npz')
        series.save(path)
        restored = time_series.load(path)
        assert_ts_equal(series, restored)

    n_channels = 3
    t, m, e = sample_time_series(channels=n_channels)

    assert_roundtrip(TimeSeries(t[0], m[0], e[0]))
    assert_roundtrip(TimeSeries(t[0], m, e[0]))

    # Channel k keeps only its first k + 2 samples, making the input ragged.
    t = [row[0:k + 2] for k, row in enumerate(t)]
    m = [row[0:k + 2] for k, row in enumerate(m)]
    e = [row[0:k + 2] for k, row in enumerate(e)]
    assert_roundtrip(TimeSeries(t, m, e))
Exemplo n.º 17
0
def get_ts_obj(object_id, meta_data, ts_data, pbmap):
    """Build a `TimeSeries` for one object, with one channel per passband.

    Parameters
    ----------
    object_id
        Identifier matched against the ``object_id`` column of both tables.
    meta_data
        Table queried for the object's row; its ``target`` column, when
        present, provides the label.
    ts_data
        Table with ``object_id``, ``passband``, ``mjd``, ``flux`` and
        ``flux_err`` columns.
    pbmap : dict
        Its keys are compared against ``passband``; its values become the
        channel names.

    Returns
    -------
    tuple
        ``(object_id, ts_obj)``.
    """
    pbnames = list(pbmap.values())

    # `query` resolves ``@object_id`` from this function's local variables.
    row = meta_data.query('object_id == @object_id')
    target = row['target'] if 'target' in meta_data.columns else None

    extract_ts = ts_data.query('object_id == @object_id')
    # One boolean mask per passband key, in `pbmap` iteration order.
    masks = [(extract_ts['passband'] == pb) for pb in pbmap]

    def per_passband(column):
        # Values of `column` split into one array per passband.
        return [extract_ts[column][mask].values for mask in masks]

    t = per_passband('mjd')
    m = per_passband('flux')
    e = per_passband('flux_err')

    ts_obj = TimeSeries(
        t=t, m=m, e=e,
        label=target, name=object_id, channel_names=pbnames,
    )
    return (object_id, ts_obj)
Exemplo n.º 18
0
def read_lightcurves():
    """Read the training light curves and build one `TimeSeries` per object.

    Relies on the module-level names `datadir`, `nobjects`, `metadata`,
    `pbmap` and `pbnames`.

    Returns
    -------
    OrderedDict
        Maps each ``object_id`` to its `TimeSeries`, in `metadata` row order.
    """
    # Load the light curve data from the csv file into an Astropy table.
    lcdata = Table.read(f'{datadir}/training_set.csv', format='csv')

    tsdict = OrderedDict()
    for idx in trange(nobjects, desc="Building Timeseries"):
        row = metadata[idx]
        thisid = row['object_id']
        target = row['target']

        # Per-object values handed to the TimeSeries as meta features.
        meta = {
            'z': row['hostgal_photoz'],
            'zerr': row['hostgal_photoz_err'],
            'mwebv': row['mwebv']
        }

        # Rows of this object, then one boolean mask per passband key.
        thislc = lcdata[lcdata['object_id'] == thisid]
        masks = [(thislc['passband'] == pb) for pb in pbmap]

        t, m, e = (
            [thislc[column][mask].data for mask in masks]
            for column in ('mjd', 'flux', 'flux_err')
        )

        tsdict[thisid] = TimeSeries(
            t=t, m=m, e=e,
            label=target, name=thisid,
            meta_features=meta, channel_names=pbnames,
        )

    return tsdict
Exemplo n.º 19
0
def test_channels_iterator():
    """Check that `TimeSeries.channels()` yields each channel's (t, m, e).

    Exercised with a single channel, rectangular multi-channel data and
    ragged multi-channel data.
    """
    n_channels = 3
    t, m, e = sample_time_series(channels=n_channels)

    # Single channel: every yielded triple must equal the inputs.
    single = TimeSeries(t[0], m[0], e[0])
    for chan_t, chan_m, chan_e in single.channels():
        npt.assert_allclose(chan_t, t[0])
        npt.assert_allclose(chan_m, m[0])
        npt.assert_allclose(chan_e, e[0])

    def assert_channels_match(series, times, values, errors):
        # Compare the k-th yielded triple with the k-th input rows.
        for k, (chan_t, chan_m, chan_e) in zip(range(n_channels),
                                               series.channels()):
            npt.assert_allclose(chan_t, times[k])
            npt.assert_allclose(chan_m, values[k])
            npt.assert_allclose(chan_e, errors[k])

    assert_channels_match(TimeSeries(t, m, e), t, m, e)

    # Channel k keeps only its first k + 2 samples, making the input ragged.
    t = [row[0:k+2] for k, row in enumerate(t)]
    m = [row[0:k+2] for k, row in enumerate(m)]
    e = [row[0:k+2] for k, row in enumerate(e)]
    assert_channels_match(TimeSeries(t, m, e), t, m, e)
Exemplo n.º 20
0
def featurize_data(pbmap, pbnames, lcdata, metadata, nobjects, featurefile):
    """Extract cesium features for PLaSTiCC light curves.

    Builds one `TimeSeries` per object, then either loads a previously
    saved feature set from `featurefile` or computes it with a process
    pool, imputes it and saves it. The result is returned as a pandas
    DataFrame with a trailing ``target`` column.

    Original author: luisarribas (created Mon Apr 29 19:30:52 2019).

    Parameters
    ----------
    pbmap : dict
        Its keys are compared against the ``passband`` column; its values
        are used as suffixes of the output column names.
    pbnames
        Channel names passed to every `TimeSeries`.
    lcdata
        Light-curve table with ``object_id``, ``passband``, ``mjd``,
        ``flux`` and ``flux_err`` columns.
    metadata
        Per-object table, indexable by row number and by column name.
    nobjects : int
        Number of leading `metadata` rows to process.
    featurefile
        Path of the feature-set cache; loaded when it exists, written
        otherwise.

    Returns
    -------
    pandas.DataFrame
        Feature columns followed by ``target``.
    """
    def announce_done():
        # Shared "finished" banner printed after each stage.
        print("")
        print("OK!")
        print(" ")

    for line in ("",
                 "EXTRACTING FEATURES",
                 "===================",
                 "",
                 "Building Timeseries....wait",
                 "==========================="):
        print(line)

    # ---- Build the time series -------------------------------------------
    tsdict = OrderedDict()
    for idx in range(nobjects):
        row = metadata[idx]
        thisid = row['object_id']
        target = row['target']

        meta = {
            'zBand': row['zBand'],
            'z': row['hostgal_photoz'],
            'zerr': row['hostgal_photoz_err'],
            'mag': row['magnitude'],
            'u-b': row['u-b'],
            'b-v': row['b-v'],
        }

        # Rows of this object, then one boolean mask per passband key.
        thislc = lcdata[lcdata['object_id'] == thisid]
        masks = [(thislc['passband'] == pb) for pb in pbmap]
        t = [thislc['mjd'][mask].data for mask in masks]
        m = [thislc['flux'][mask].data for mask in masks]
        e = [thislc['flux_err'][mask].data for mask in masks]

        tsdict[thisid] = TimeSeries(
            t=t, m=m, e=e,
            label=target, name=thisid,
            meta_features=meta, channel_names=pbnames,
        )

    announce_done()

    # ---- Feature extraction with cesium ----------------------------------
    warnings.simplefilter('ignore')
    if not os.path.exists(featurefile):
        print("")
        print("Computing features....wait")
        print("==========================")

        # One `worker` call per time series, spread over a process pool.
        with schwimmbad.MultiPool() as pool:
            features_list = list(pool.imap(worker, list(tsdict.values())))

        featuretable = featurize.assemble_featureset(
            features_list=features_list,
            time_series=tsdict.values())
        featurize.impute_featureset(fset=featuretable,
                                    strategy='constant',
                                    value=0,
                                    max_value=18446744073709551000,
                                    inplace=True)
        featurize.save_featureset(fset=featuretable, path=featurefile)
        announce_done()
    else:
        print("")
        print("Loading features from file....wait")
        print("==================================")
        featuretable, _ = featurize.load_featureset(featurefile)
        announce_done()

    # ---- Build the pandas DataFrame output -------------------------------
    old_names = featuretable.columns.values
    # Column keys are pairs; the second item is mapped through `pbmap`,
    # falling back to 'meta' when it is not a passband key.
    new_names = [
        '{}_{}'.format(feature, pbmap.get(channel, 'meta'))
        for feature, channel in old_names
    ]
    columns = [featuretable[col] for col in old_names]
    feature_tbl = Table(columns, names=new_names, masked=False)
    feature_tbl['target'] = metadata['target']

    # Replace NaNs, then rebuild a table to restore the column names.
    cleaned = np.nan_to_num(feature_tbl.to_pandas())
    new_names.append('target')
    allfeats = Table(cleaned, names=new_names, masked=False).to_pandas()

    print("")
    print("Extracted features = ", len(allfeats.columns))
    print("==========================")
    print("")
    print("Nan Values detected = ", sum(len(allfeats) - allfeats.count()))
    print("==========================")

    return allfeats
Exemplo n.º 21
0
    #where the length equals how many total observations that object has in the time series data
    pbind = [np.asarray(thislc['passband'] == pb) for pb in pbmap]
    
    #Store the times of all the measurements for this object_id. this will be a list with length 6 (one for
    # each passband) where each of the 6 items is a list of length = how many observations that object
    #has for JUST THAT PASSBAND in the time series data
    t = [np.asarray(thislc.loc[:,'mjd'][mask]) for mask in pbind]  
    
    #Store the flux measurements taken for this object_id. This is the same shape as t
    m = [np.asarray(thislc['flux'][mask]) for mask in pbind]
    
    #Store the flux measurement errors for this object_id. This is the same shape as t
    e = [np.asarray(thislc['flux_err'][mask]) for mask in pbind]
    
    #Create the TimeSeries() object using the variables defined above, and append it to the ordered dict.
    tsdict[thisid] = TimeSeries(t=t, m=m, e=e, name=thisid, meta_features=meta,channel_names=pbnames)
    
    #Occasionally print progress so we can assess speed
    if i % 1000000 == 0:
        print(str(i) + ' done out of ' + str(nobjects))

    
#Empty list for the actual features we want the TimeSeries() object to facilitate
features_list = []
print("Generating features from objects and storing to a table...")

with multiprocessing.Pool() as pool:  
    #Apply the worker function to each object in the dict of TimeSeries() objects. see ./testWorker.py
    #this returns a single row for each object with hundreds of features determined by ./testWorker.py
    results = pool.imap(worker, list(tsdict.values()))
    i = 0
Exemplo n.º 22
0
def test_time_series_npz():
    """Check that a `TimeSeries` survives a save/load round trip.

    Every case is written to the shared `TEST_TS_PATH`. Exercised with a
    single channel, several channels sharing one time/error vector, and
    ragged channels.
    """
    def assert_roundtrip(series):
        # Save to the shared test path, load it back and compare.
        series.save(TEST_TS_PATH)
        restored = time_series.load(TEST_TS_PATH)
        assert_ts_equal(series, restored)

    n_channels = 3
    t, m, e = sample_time_series(channels=n_channels)

    assert_roundtrip(TimeSeries(t[0], m[0], e[0]))
    assert_roundtrip(TimeSeries(t[0], m, e[0]))

    # Channel k keeps only its first k + 2 samples, making the input ragged.
    t = [row[0:k+2] for k, row in enumerate(t)]
    m = [row[0:k+2] for k, row in enumerate(m)]
    e = [row[0:k+2] for k, row in enumerate(e)]
    assert_roundtrip(TimeSeries(t, m, e))
Exemplo n.º 23
0
def test_time_series_npz(tmpdir):
    """Check that a `TimeSeries` survives a ``.npz`` save/load round trip.

    Each case is written to its own uniquely named file inside `tmpdir`.
    Exercised with a single channel, several channels sharing one
    time/error vector, and ragged channels.
    """
    def assert_roundtrip(series):
        # Save under a fresh file name, load it back and compare.
        path = os.path.join(str(tmpdir), str(uuid4()) + '.npz')
        series.save(path)
        restored = time_series.load(path)
        assert_ts_equal(series, restored)

    n_channels = 3
    t, m, e = sample_time_series(channels=n_channels)

    assert_roundtrip(TimeSeries(t[0], m[0], e[0]))
    assert_roundtrip(TimeSeries(t[0], m, e[0]))

    # Channel k keeps only its first k + 2 samples, making the input ragged.
    t = [row[0:k + 2] for k, row in enumerate(t)]
    m = [row[0:k + 2] for k, row in enumerate(m)]
    e = [row[0:k + 2] for k, row in enumerate(e)]
    assert_roundtrip(TimeSeries(t, m, e))