def test_timepoint_to_rowid():
    """Check that each timepoint of FRAME maps to the same-numbered row id.

    A single-chain model with lag=0 incorporates FRAME, after which every
    position 0..len(FRAME)-1 is passed through ``_timepoint_to_rowid``.
    """
    rng = np.random.RandomState(2)
    trcrpm = TRCRP_Mixture(chains=1, lag=0, variables=FRAME.columns, rng=rng)
    trcrpm.incorporate(FRAME)
    # ``range`` instead of ``xrange`` keeps the test runnable on both
    # Python 2 and Python 3; FRAME is small so the list cost is irrelevant.
    for timepoint in range(len(FRAME)):
        assert trcrpm._timepoint_to_rowid(timepoint) == timepoint
        # Ask a second time: the repeated lookup must give the same row id.
        assert trcrpm._timepoint_to_rowid(timepoint) == timepoint
def test_get_dependence_probabilities_crash():
    """Pairwise dependence probabilities should all be equal to one."""
    prng = np.random.RandomState(1)
    model = TRCRP_Mixture(
        chains=4, lag=3, variables=FRAME.columns, rng=prng)
    model.incorporate(FRAME)
    num_variables = len(model.variables)
    # Expect a square matrix of ones, one row/column per model variable.
    all_dependent = np.ones((num_variables, num_variables))
    pairwise = model.dependence_probability_pairwise()
    assert np.allclose(pairwise, all_dependent)
def test_serialize_trcrp_mixture():
    """Rebuild a model from its metadata and check the resulting type."""
    model = TRCRP_Mixture(
        chains=4, lag=3, variables=FRAME.columns,
        rng=np.random.RandomState(1))
    model.incorporate(FRAME)
    metadata = model.to_metadata()
    # The 'factory' entry is a (module name, attribute name) pair that
    # locates the object whose from_metadata rebuilds the model.
    module_name, attribute_name = metadata['factory']
    factory = getattr(importlib.import_module(module_name), attribute_name)
    restored = factory.from_metadata(metadata, seed=1)
    assert isinstance(restored, TRCRP_Mixture)
def test_incorporate_sampleid_wedged():
    """Back-fill a skipped timepoint after a later timepoint was added.

    With lag=2, FRAME is incorporated first, then a row two steps past the
    last index of FRAME, and finally the row for the index that was skipped.
    The data array of the single state is compared with the expected lagged
    table.
    """
    model = TRCRP_Mixture(
        chains=1, lag=2, variables=FRAME.columns,
        rng=np.random.RandomState(1))
    model.incorporate(FRAME)
    columns = ['a', 'b', 'c']
    final_id = max(FRAME.index)
    # Add a non-contiguous row first, leaving a hole at final_id + 1.
    skipped_ahead = pd.DataFrame(
        [[11, 12, float('nan')]], columns=columns, index=[final_id + 2])
    model.incorporate(skipped_ahead)
    # Now fill the hole at final_id + 1.
    # XXX The xfailure happens in this step.
    # NOTE(review): no xfail marker is visible on this function in this
    # view -- confirm whether one is expected here.
    backfill = pd.DataFrame(
        [[3, 3, 3]], columns=columns, index=[final_id + 1])
    model.incorporate(backfill)
    observed = model.engine.states[0].data_array()
    expected = [
        [nan, nan, 86., nan, nan, 57., nan, nan, 65.],
        [nan, 86., 19., nan, 57., 62., nan, 65., nan],
        [86., 19., 17., 57., 62., 41., 65., nan, 17.],
        [19., 17., nan, 62., 41., 7., nan, 17., 30.],
        [17., nan, 75., 41., 7., 1., 17., 30., 12.],
        [nan, 75., 45., 7., 1., nan, 30., 12., 72.],
        [75., 45., 48., 1., nan, 77., 12., 72., 8.],
        [45., 48., 29., nan, 77., nan, 72., 8., 86.],
        [48., 29., 83., 77., nan, 46., 8., 86., 38.],
        [29., 83., nan, nan, 46., 54., 86., 38., nan],
        # Row that gets updated once the hole is filled.
        [nan, 3., 11., 54., 3., 12., nan, 3., nan],
        # Row that is newly created for the back-filled timepoint.
        [83., nan, 3., 46., 54., 3., 38., nan, 3.],
    ]
    assert np.allclose(observed, expected, equal_nan=True)
def test_tabulate_lagged_data():
    """Check the tabulated data array of a one-chain model at lag 0 and 2.

    Both models share one random state and incorporate the same FRAME; the
    data array of the single engine state is compared against the expected
    table, treating nan cells as equal.
    """
    rng = np.random.RandomState(2)

    # Lag 0: the tabulated array must match the raw data.
    trcrpm = TRCRP_Mixture(chains=1, lag=0, variables=FRAME.columns, rng=rng)
    trcrpm.incorporate(FRAME)
    tabulated_data = trcrpm.engine.states[0].data_array()
    assert np.allclose(tabulated_data, DATA_RAW, equal_nan=True)

    # Lag 2: each variable contributes three columns.  test_column_indexing
    # lists the order as a.lag.2, a.lag.1, a.lag.0, b.lag.2, ..., c.lag.0.
    trcrpm = TRCRP_Mixture(chains=1, lag=2, variables=FRAME.columns, rng=rng)
    trcrpm.incorporate(FRAME)
    tabulated_data = trcrpm.engine.states[0].data_array()
    expected_data = [
        [nan, nan, 86., nan, nan, 57., nan, nan, 65.],
        [nan, 86., 19., nan, 57., 62., nan, 65., nan],
        [86., 19., 17., 57., 62., 41., 65., nan, 17.],
        [19., 17., nan, 62., 41., 7., nan, 17., 30.],
        [17., nan, 75., 41., 7., 1., 17., 30., 12.],
        [nan, 75., 45., 7., 1., nan, 30., 12., 72.],
        [75., 45., 48., 1., nan, 77., 12., 72., 8.],
        [45., 48., 29., nan, 77., nan, 72., 8., 86.],
        [48., 29., 83., 77., nan, 46., 8., 86., 38.],
        [29., 83., nan, nan, 46., 54., 86., 38., nan],
    ]
    assert np.allclose(tabulated_data, expected_data, equal_nan=True)
def test_simulate_dimensions():
    """Check the nesting dimensions of samples from both simulators.

    Samples are expected to be nested as samples -> timepoints -> variables,
    with ``nsamples`` samples per chain.
    """
    model = TRCRP_Mixture(
        chains=4, lag=3, variables=FRAME.columns,
        rng=np.random.RandomState(1))
    model.incorporate(FRAME)

    def check_dims_correct(timepoints, variables, nsamples, simulator):
        """Assert the three nesting levels of the simulator output."""
        draws = simulator(timepoints, variables, nsamples, multiprocess=0)
        # Number of chains is 4.
        assert len(draws) == nsamples * 4
        for draw in draws:
            assert len(draw) == len(timepoints)
            for per_timepoint in draw:
                assert len(per_timepoint) == len(variables)

    for simulator in [model.simulate, model.simulate_ancestral]:
        # Build the cases inside the loop so every call receives fresh
        # argument lists.
        cases = [
            ([1], ['a'], 4),
            ([1], ['a', 'b'], 7),
            ([1, 2], ['a'], 1),
            ([1, 2, 3], ['c', 'b'], 10),
            ([9, 10, 11, 12], ['c', 'b'], 10),
        ]
        for timepoints, variables, nsamples in cases:
            check_dims_correct(timepoints, variables, nsamples, simulator)
def test_incorporate_sampleid_non_contiguous():
    """Incorporate a row whose index skips one step past the end of FRAME.

    With lag=2, the data array of the single state is compared with the
    expected lagged table after the non-contiguous row has been added.
    """
    model = TRCRP_Mixture(
        chains=1, lag=2, variables=FRAME.columns,
        rng=np.random.RandomState(1))
    model.incorporate(FRAME)
    # Add a non-contiguous row: final_id + 1 is deliberately skipped.
    final_id = max(FRAME.index)
    skipped_ahead = pd.DataFrame(
        [[11, 12, float('nan')]],
        columns=['a', 'b', 'c'],
        index=[final_id + 2])
    model.incorporate(skipped_ahead)
    observed = model.engine.states[0].data_array()
    expected = [
        [nan, nan, 86., nan, nan, 57., nan, nan, 65.],
        [nan, 86., 19., nan, 57., 62., nan, 65., nan],
        [86., 19., 17., 57., 62., 41., 65., nan, 17.],
        [19., 17., nan, 62., 41., 7., nan, 17., 30.],
        [17., nan, 75., 41., 7., 1., 17., 30., 12.],
        [nan, 75., 45., 7., 1., nan, 30., 12., 72.],
        [75., 45., 48., 1., nan, 77., 12., 72., 8.],
        [45., 48., 29., nan, 77., nan, 72., 8., 86.],
        [48., 29., 83., 77., nan, 46., 8., 86., 38.],
        [29., 83., nan, nan, 46., 54., 86., 38., nan],
        # Row for the non-contiguous timepoint.
        [nan, nan, 11, 54., nan, 12, nan, nan, nan],
    ]
    assert np.allclose(observed, expected, equal_nan=True)
def test_incorporate_sampleid_existing():
    """Incorporate new cells into timepoints that already exist.

    After FRAME is incorporated with lag=2, a second frame indexed by the
    existing timepoints 1, 3 and 9 is incorporated.  The expected table
    shows its non-nan cells appearing in the lagged columns, while its nan
    cells leave the previously tabulated values in place.
    """
    # Seeded like the sibling tests so that a failure is reproducible.
    rng = np.random.RandomState(1)
    trcrpm = TRCRP_Mixture(chains=1, lag=2, variables=FRAME.columns, rng=rng)
    trcrpm.incorporate(FRAME)
    new_frame = pd.DataFrame(
        [
            [nan, nan, 99.],
            [44., nan, nan],
            [21., nan, 88.],
        ],
        index=[1, 3, 9],
        columns=['a', 'b', 'c'],
    )
    trcrpm.incorporate(new_frame)
    tabulated_data = trcrpm.engine.states[0].data_array()
    expected_data = [
        [nan, nan, 86., nan, nan, 57., nan, nan, 65.],
        [nan, 86., 19., nan, 57., 62., nan, 65., 99.],
        [86., 19., 17., 57., 62., 41., 65., 99., 17.],
        [19., 17., 44., 62., 41., 7., 99., 17., 30.],
        [17., 44., 75., 41., 7., 1., 17., 30., 12.],
        [44., 75., 45., 7., 1., nan, 30., 12., 72.],
        [75., 45., 48., 1., nan, 77., 12., 72., 8.],
        [45., 48., 29., nan, 77., nan, 72., 8., 86.],
        [48., 29., 83., 77., nan, 46., 8., 86., 38.],
        [29., 83., 21., nan, 46., 54., 86., 38., 88.],
    ]
    assert np.allclose(tabulated_data, expected_data, equal_nan=True)
def test_get_temporal_regimes_crash():
    """Check the shape of temporal regimes for every model variable.

    Regimes are expected to have one row per chain and one column per
    timepoint, either for the whole dataset or for an explicit subset.
    """
    model = TRCRP_Mixture(
        chains=4, lag=3, variables=FRAME.columns,
        rng=np.random.RandomState(1))
    model.incorporate(FRAME)
    for name in model.variables:
        # Without timepoints: one column per row of the model's dataset.
        full = model.get_temporal_regimes(name)
        full_shape = (model.chains, len(model.dataset))
        assert np.shape(full) == full_shape
        # With three explicit timepoints: exactly three columns.
        partial = model.get_temporal_regimes(name, timepoints=[0, 1, 2])
        partial_shape = (model.chains, 3)
        assert np.shape(partial) == partial_shape
def test_dependence_constraints():
    """Check dependence constraints for flat and hierarchical mixtures.

    All models are built with one chain over FRAME's columns and share a
    single random state.
    """
    rng = np.random.RandomState(2)

    def flat(lag):
        """Build a TRCRP_Mixture with the given lag."""
        return TRCRP_Mixture(
            chains=1, lag=lag, variables=FRAME.columns, rng=rng)

    def hierarchical(lag, dependencies):
        """Build a Hierarchical_TRCRP_Mixture with the given settings."""
        return Hierarchical_TRCRP_Mixture(
            chains=1, lag=lag, variables=FRAME.columns, rng=rng,
            dependencies=dependencies)

    # All variables in TRCRP_Mixture are dependent (lag 0).
    constraints = flat(0)._get_variable_dependence_constraints()
    assert constraints == [[0, 1, 2]]

    # All variables in TRCRP_Mixture are dependent (lag 2).
    constraints = flat(2)._get_variable_dependence_constraints()
    assert constraints == [[2, 1, 0, 5, 4, 3, 8, 7, 6]]

    # All variables in TRCRP_Mixture are dependent (lag 5).
    constraints = flat(5)._get_variable_dependence_constraints()
    assert constraints == [
        [5, 4, 3, 2, 1, 0, 11, 10, 9, 8, 7, 6, 17, 16, 15, 14, 13, 12],
    ]

    # Lag 0 dependencies should skip singleton 'c'.
    model = hierarchical(0, [['a', 'b'], ['c']])
    assert model._get_variable_dependence_constraints() == [[0, 1]]

    # Lag 0 dependencies with all constrained.
    model = hierarchical(0, [['a', 'b', 'c']])
    assert model._get_variable_dependence_constraints() == [[0, 1, 2]]

    # Lag 1 dependencies with two constraints.
    model = hierarchical(1, [['a', 'b']])
    assert model._get_variable_dependence_constraints() == [
        [5, 4],
        [1, 0, 3, 2],
    ]

    # Lag 1 dependency with single constraint.
    model = hierarchical(1, [['a']])
    assert model._get_variable_dependence_constraints() == [
        [3, 2],
        [5, 4],
        [1, 0],
    ]

    # Multiple customer dependencies. Capturing a runtime error since
    # incorporate is going to use multiprocess so parallel_map captures the
    # ValueError and throws a RuntimeError.
    with pytest.raises(RuntimeError):
        model = hierarchical(0, [['a', 'c'], ['a', 'b']])
        model.incorporate(FRAME)
def test_get_cgpm_constraints():
    """Check the constraints reported for incorporated and fresh timepoints.

    A lag=3 model incorporates FRAME; constraints are then queried for
    timepoints inside and beyond the dataset, before and after further
    rows are incorporated.
    """
    model = TRCRP_Mixture(
        chains=1, lag=3, variables=FRAME.columns,
        rng=np.random.RandomState(2))
    model.incorporate(FRAME)
    names = ['a', 'b', 'c']

    # For reference, lag=3 implies the following tabulation
    #   column a      column b      column c
    #   0 1 2 3       4 5 6 7       8 9 10 11

    # Timepoints that are already incorporated carry no constraints.
    assert all(
        model._get_cgpm_constraints(timepoint) is None
        for timepoint in model.dataset.index)

    # Fresh timepoint 10 should have appropriate constraints.
    expected = {
        0: 29, 1: 83,     # column a
        5: 46, 6: 54,     # column b
        8: 86, 9: 38,     # column c
    }
    assert model._get_cgpm_constraints(10) == expected

    # Incorporating data into timepoint 9 should update constraints for 10.
    update_9 = pd.DataFrame([[-12, nan, -12]], columns=names, index=[9])
    model.incorporate(update_9)
    expected = {
        0: 29, 1: 83, 2: -12,     # column a
        5: 46, 6: 54,             # column b
        8: 86, 9: 38, 10: -12,    # column c
    }
    assert model._get_cgpm_constraints(10) == expected

    # Fresh timepoint 12 should have appropriate constraints.
    expected = {
        0: -12,    # column a
        4: 54,     # column b
        8: -12,    # column c
    }
    assert model._get_cgpm_constraints(12) == expected

    # Incorporating data into timepoint 11 should update constraints for 12.
    update_11 = pd.DataFrame([[-44, -48, -47]], columns=names, index=[11])
    model.incorporate(update_11)
    expected = {
        0: -12, 2: -44,    # column a
        4: 54, 6: -48,     # column b
        8: -12, 10: -47,   # column c
    }
    assert model._get_cgpm_constraints(12) == expected
def test_column_indexing():
    """Check lagged variable names and their column indexes.

    Models with lag 0, 2 and 5 are built from one shared random state; for
    each, the lagged variable names, the per-lag column index of 'a', 'b'
    and 'c', and (for lag 2 and 5) the window indexes are verified.
    """
    rng = np.random.RandomState(2)
    names = ['a', 'b', 'c']

    # ---- lag 0 ----
    model = TRCRP_Mixture(chains=1, lag=0, variables=FRAME.columns, rng=rng)
    assert model.variables_lagged == ['a.lag.0', 'b.lag.0', 'c.lag.0']
    # Index lookups without an explicit lag argument.
    for name, column in zip(names, [0, 1, 2]):
        assert model._variable_to_index(name) == column
    assert model._variable_indexes() == [0, 1, 2]

    # ---- lag 2 ----
    model = TRCRP_Mixture(chains=1, lag=2, variables=FRAME.columns, rng=rng)
    assert model.variables_lagged == [
        'a.lag.2', 'a.lag.1', 'a.lag.0',
        'b.lag.2', 'b.lag.1', 'b.lag.0',
        'c.lag.2', 'c.lag.1', 'c.lag.0',
    ]
    # Index lookups without an explicit lag argument.
    for name, column in zip(names, [2, 5, 8]):
        assert model._variable_to_index(name) == column
    # Index lookups with an explicit lag: (lag, columns of a, b, c).
    for lag, columns in [(1, [1, 4, 7]), (2, [0, 3, 6])]:
        for name, column in zip(names, columns):
            assert model._variable_to_index(name, lag=lag) == column
    assert model._variable_indexes() == [2, 5, 8]
    windows = [[2, 1, 0], [5, 4, 3], [8, 7, 6]]
    for name, window in zip(names, windows):
        assert model._variable_to_window_indexes(name) == window

    # ---- lag 5 ----
    model = TRCRP_Mixture(chains=1, lag=5, variables=FRAME.columns, rng=rng)
    assert model.variables_lagged == [
        'a.lag.5', 'a.lag.4', 'a.lag.3', 'a.lag.2', 'a.lag.1', 'a.lag.0',
        'b.lag.5', 'b.lag.4', 'b.lag.3', 'b.lag.2', 'b.lag.1', 'b.lag.0',
        'c.lag.5', 'c.lag.4', 'c.lag.3', 'c.lag.2', 'c.lag.1', 'c.lag.0',
    ]
    # Index lookups without an explicit lag argument.
    for name, column in zip(names, [5, 11, 17]):
        assert model._variable_to_index(name) == column
    # Index lookups with an explicit lag: (lag, columns of a, b, c).
    lagged_columns = [
        (1, [4, 10, 16]),
        (2, [3, 9, 15]),
        (5, [0, 6, 12]),
    ]
    for lag, columns in lagged_columns:
        for name, column in zip(names, columns):
            assert model._variable_to_index(name, lag=lag) == column
    assert model._variable_indexes() == [5, 11, 17]
    windows = [
        [5, 4, 3, 2, 1, 0],
        [11, 10, 9, 8, 7, 6],
        [17, 16, 15, 14, 13, 12],
    ]
    for name, window in zip(names, windows):
        assert model._variable_to_window_indexes(name) == window
def test_trcrp_mixture_all_dependent():
    """Check that 'a', 'b' and 'c' share one view and identical regimes.

    Resampling before any data is incorporated must raise ValueError.
    After incorporating FRAME and resampling, every state is expected to
    hold a single view and the temporal regimes of the three variables are
    expected to coincide.
    """
    model = TRCRP_Mixture(
        chains=3, lag=3, variables=FRAME.columns,
        rng=np.random.RandomState(2))
    # No data incorporated yet.
    with pytest.raises(ValueError):
        model.resample_all(steps=10)
    model.incorporate(FRAME)
    model.resample_all(steps=10)
    assert all(len(state.views) == 1 for state in model.engine.states)
    model.resample_hyperparameters(steps=5)
    reference = model.get_temporal_regimes('a')
    for other in ['b', 'c']:
        assert np.all(reference == model.get_temporal_regimes(other))
import numpy as np from trcrpm import TRCRP_Mixture from collections import Counter print "f**k yeah" data = pd.read_csv("./data/anomaly0245.csv", index_col=0) data = data.iloc[156600:240000].reset_index(drop=True) # Setup the placekeeping and initilizing variables chain = 0 x, eng_val, states, num_states = [], [], [], [] i = 0 step = 30 print(i) rng = np.random.RandomState(1) model = TRCRP_Mixture(chains=1, lag=10, variables=data.columns, rng=rng) model.incorporate(data[i:i + step]) model.resample_all(seconds=10) model.resample_hyperparameters(seconds=10) s = model.get_temporal_regimes('anomaly')[chain] num_states = step * [len(sorted(set(s)))] states = list(s[i:i + step]) eng_val = data.iloc[i:i + step, 0].tolist() x = list(range(i, i + step)) for i in range(step, len(data) - step, step): model.incorporate(data[i:i + step]) model.resample_all(seconds=10) model.resample_hyperparameters(seconds=10) s = model.get_temporal_regimes("anomaly")[chain] num_states = step * [len(sorted(set(s)))]