# 6-> Simplest absorbing state case for validation purposes # DURATION TYPE DATASETS (Long format) # 7-> S&P Credit Rating Migration Matrix # 8-> Simplest absorbing state case for validation purposes (Duration estimator) # 9-> Example with dates in string formats dataset = 9 # # Duration type datasets in Compact Format # if dataset == 1: # This dataset simulates single entity transitions # State Space definition myState = tm.StateSpace([('0', "A"), ('1', "B"), ('2', "C"), ('3', "D")]) # myState.describe() # n: number of entities # s: number of samples per entity data = dataset_generators.exponential_transitions(myState, n=1, sample=100, rate=0.1) sorted_data = data.sort_values(['ID', 'Time'], ascending=[True, True]) sorted_data.to_csv(dataset_path + 'synthetic_data1.csv', index=False) elif dataset == 2: # Second example: Multiple Entities observed over continuous short time interval myState = tm.StateSpace([('0', "Basic"), ('1', "Default")]) data = dataset_generators.exponential_transitions(myState, n=1000,
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
# either express or implied. See the License for the specific language governing permissions and
# limitations under the License.

# Example script. Open Risk Academy Course Step 4.

import pandas as pd

import transitionMatrix as tm
from transitionMatrix.estimators import cohort_estimator as es

dataset_path = "../../datasets/"

# Load the synthetic two-state dataset; keep the State column as strings so
# state identifiers are not coerced to integers.
raw_frame = pd.read_csv(dataset_path + 'synthetic_data2.csv', dtype={'State': str})

# Order observations by entity and observation time.
ordered_frame = raw_frame.sort_values(['ID', 'Time'], ascending=[True, True])

# Two-state space: Basic / Default.
state_space = tm.StateSpace([('0', "Basic"), ('1', "Default")])
state_space.describe()
print(state_space.validate_dataset(dataset=ordered_frame))

# Bucket the raw (unordered) observations into five cohorts.
cohort_data, cohort_intervals = tm.utils.bin_timestamps(raw_frame, cohorts=5)

# Estimate per-cohort transition matrices with Goodman confidence intervals.
estimator = es.CohortEstimator(states=state_space, ci={'method': 'goodman', 'alpha': 0.05})
labels = {'Timestamp': 'Cohort', 'State': 'State', 'ID': 'ID'}
result = estimator.fit(cohort_data, labels=labels)

# Package the estimates as an incremental matrix set and display them.
matrix_collection = tm.TransitionMatrixSet(values=result, temporal_type='Incremental')
print(matrix_collection.temporal_type)
matrix_collection.print_matrix()
import transitionMatrix as tm
from transitionMatrix import source_path
from transitionMatrix.estimators import simple_estimator as es

# NOTE(review): this chunk uses `pd` (pandas) without a visible import —
# presumably imported earlier in the file; verify.
dataset_path = source_path + "datasets/"

# Example: LendingClub Style Migration Matrix Set
# Load historical data into pandas frame
# Format:
# Expected Data Format is (ID, State_IN, State_OUT)
definition = [('A', "Grade A"), ('B', "Grade B"), ('C', "Grade C"), ('D', "Grade D"),
              ('E', "Grade E"), ('F', "Grade F"), ('G', "Grade G"),
              ('H', "Delinquent"), ('I', "Charged Off"), ('J', "Repaid")]
myState = tm.StateSpace(definition)

# Load the data sets into a pandas frame in sequence
# Check matrix_lendingclub.py for comments
matrix_set = []
for letter in ['a', 'b', 'c', 'd']:
    # store the derived one-period matrices in a list
    data = pd.read_csv(dataset_path + 'LoanStats3' + letter + '_Step2.csv')
    # Estimate a one-period matrix with Goodman confidence intervals.
    myEstimator = es.SimpleEstimator(states=myState, ci={'method': 'goodman', 'alpha': 0.05})
    result = myEstimator.fit(data)
    myEstimator.summary()
    myMatrix = tm.TransitionMatrix(result)
    # Override rows 7 (Delinquent), 8 (Charged Off) and 9 (Repaid) so each
    # moves to state 9 with probability one — presumably forcing terminal
    # outcomes not present in the raw data; confirm against the state space.
    myMatrix[7, 9] = 1.0
    myMatrix[8, 9] = 1.0
    myMatrix[9, 9] = 1.0
def test_validate_dataset(self):
    """Validation reports that the test dataset matches the declared state space."""
    frame = pd.read_csv(source_path + "datasets/" + 'test.csv', dtype={'State': str})
    stages = [('0', "Stage 1"), ('1', "Stage 2"), ('2', "Stage 3")]
    space = tm.StateSpace(stages)
    outcome = space.validate_dataset(dataset=frame)
    self.assertEqual(outcome[0], "Dataset contains the expected states.")
bottom=0.1, right=0.9, top=0.9, wspace=0, hspace=0.4)  # NOTE(review): continuation of a subplot layout call opened above this chunk
    f.suptitle(summary, fontsize=12)
    plt.show()
elif example == 3:
    #
    # Histogram Plots of transition frequencies
    #
    # Keep the State column as strings so state identifiers are not coerced.
    data = pd.read_csv('../datasets/synthetic_data5.csv', dtype={'State': str})
    # Order observations by entity and timestep.
    sorted_data = data.sort_values(['ID', 'Timestep'], ascending=[True, True])
    description = [('0', "Stage 1"), ('1', "Stage 2"), ('2', "Stage 3")]
    myState = tm.StateSpace(description)
    myState.describe()
    # Cohort estimator with Goodman confidence intervals.
    myEstimator = es.CohortEstimator(states=myState, ci={'method': 'goodman', 'alpha': 0.05})
    result = myEstimator.fit(sorted_data)
    # Packaging step
    # Flatten the per-cohort results: one row per (cohort, origin state).
    viz_data = []
    for k in range(len(result)):
        for s in range(len(myState.get_states())):
            raw_data = result[k][s, :]
            viz_data.append(raw_data)
def test_generic(self):
    """A generic state space of size ten labels its last state '9'."""
    size = 10
    space = tm.StateSpace()
    space.generic(n=size)
    labels = space.get_state_labels()
    self.assertEqual(labels[size - 1], str(size - 1))
def test_get_state_labels(self):
    """The first label of a credit-rating state space is 'AAA'."""
    ratings = [('0', "AAA"), ('1', "AA"), ('2', "A"), ('3', "BBB"),
               ('4', "BB"), ('5', "B"), ('6', "CCC"), ('7', "D")]
    space = tm.StateSpace(ratings)
    labels = space.get_state_labels()
    self.assertEqual(labels[0], 'AAA')
def test_instantiate_state(self):
    """Instantiating a state space stores the supplied description verbatim."""
    ratings = [('0', "AAA"), ('1', "AA"), ('2', "A"), ('3', "BBB"),
               ('4', "BB"), ('5', "B"), ('6', "CCC"), ('7', "D")]
    space = tm.StateSpace(ratings)
    first_entry = space.description[0]
    self.assertEqual(first_entry[1], 'AAA')
def test_cohort_estimator_matrix(self):
    """Test that the estimated matrix is same as the matrix that was used
    to generate the data:

        [[0.8, 0.15, 0.05],
         [0.1, 0.7, 0.2],
         [0.0, 0.0, 1.0]]
    """
    expected = [[0.8, 0.15, 0.05],
                [0.1, 0.7, 0.2],
                [0.0, 0.0, 1.0]]
    frame = pd.read_csv(source_path + "datasets/" + 'synthetic_data5.csv')
    state_space = tm.StateSpace([('0', "Stage 1"), ('1', "Stage 2"), ('2', "Stage 3")])
    ordered = frame.sort_values(['ID', 'Time'], ascending=[True, True])
    estimator = es.CohortEstimator(states=state_space,
                                   cohort_bounds=[0, 1, 2, 3, 4],
                                   ci={'method': 'goodman', 'alpha': 0.05})
    estimator.fit(ordered)
    am = estimator.average_matrix
    # Compare every entry of the averaged estimate against the generator
    # matrix, in the same row-major order as the original assertions.
    for row, expected_row in enumerate(expected):
        for col, value in enumerate(expected_row):
            self.assertAlmostEqual(am[row, col], value,
                                   places=ACCURATE_DIGITS, msg=None, delta=None)
dataset_path = source_path + "datasets/" # Select the example to run # 1-> An example with limited data (dataset contains only one entity) # 2-> A full example with a 2x2 matrix # 3-> A full example with a 8x8 matrix example = 4 if example == 1: # An example with limited data (dataset contains only one entity) data = pd.read_csv(dataset_path + 'synthetic_data1.csv', dtype={'State': str}) sorted_data = data.sort_values(['ID', 'Time'], ascending=[True, True]) myState = tm.StateSpace([('0', "A"), ('1', "B"), ('2', "C"), ('3', "D")]) print("> Validate data set") print(myState.validate_dataset(dataset=sorted_data)) # Bin the data into 5 intervals cohort_data, cohort_intervals = tm.utils.bin_timestamps(data, cohorts=5) print("> Cohort intervals: ", cohort_intervals) print(80 * '=') print("> Cohort data") print(cohort_data) myEstimator = es.CohortEstimator(states=myState, ci={ 'method': 'goodman', 'alpha': 0.05 }) labels = {'Time': 'Cohort', 'State': 'State', 'ID': 'ID'} print(80 * '=')