def test_explain_instance_Right_Right_addBEFORELeft_NOoverlap(self):
    """Explain the right side against itself, injecting left tokens first.

    The right record is both the variable and the fixed side; the left
    tokens are added before perturbation with ``overlap=False``.  The
    expected encoding lists the right tokens followed only by the left
    tokens that the right attribute does not already contain.
    """
    columns = ['id', 'left_A', 'left_B', 'right_A', 'right_B']
    record = [1, 'l1 l2 l3 l1 l5', 'm1 m2 m3 m4', 'l1 r2 l3 r1', 's1 s2 m2']
    el = pd.DataFrame([record], columns=columns)

    explainer = Landmark(self.fake_pred, el, lprefix='left_',
                         rprefix='right_', split_expression=r' ')
    explainer.explain_instance(el,
                               variable_side='right',
                               fixed_side='right',
                               add_before_perturbation='left',
                               overlap=False,
                               num_samples=500)

    expected_encoding = ('A00_l1 A01_r2 A02_l3 A03_r1 A04_l2 A05_l5 '
                         'B00_s1 B01_s2 B02_m2 B03_m1 B04_m3 B05_m4')
    self.assertEqual(explainer.variable_data, expected_encoding)

    # The fixed side must be exactly the untouched right_* columns.
    right_columns = [name for name in el.columns if name.startswith('right_')]
    self.assertTrue(explainer.fixed_data.equals(el[right_columns]))

    # Every encoded token must appear, in order, as an explanation feature.
    feature_names = [
        pair[0] for pair in explainer.explanations['right1'].as_list()
    ]
    self.assertEqual(feature_names, expected_encoding.split(' '))
def test_explain_instance_Right_Right_addLeftAFTER(self):
    """Explain the right side against itself, appending left tokens afterwards.

    With ``add_after_perturbation='left'`` the variable encoding holds only
    the right tokens, while every perturbed left attribute must end with the
    original left attribute value.
    """
    left_a, left_b = 'l1 l2 l3 l1 l5', 'm1 m2 m3 m4'
    right_a, right_b = 'r1 r2 r3', 's1 s2'
    el = pd.DataFrame(
        [[1, left_a, left_b, right_a, right_b]],
        columns=['id', 'left_A', 'left_B', 'right_A', 'right_B'])

    explainer = Landmark(self.fake_pred, el, lprefix='left_',
                         rprefix='right_', split_expression=r' ')
    explainer.explain_instance(el,
                               variable_side='right',
                               fixed_side='right',
                               add_after_perturbation='left',
                               num_samples=500)

    # The original left values are appended after the perturbed text.
    perturbed = explainer.tmp_dataset
    self.assertTrue(perturbed.left_A.str.endswith(left_a).all())
    self.assertTrue(perturbed.left_B.str.endswith(left_b).all())

    expected_encoding = 'A00_r1 A01_r2 A02_r3 B00_s1 B01_s2'
    self.assertEqual(explainer.variable_data, expected_encoding)

    right_columns = [name for name in el.columns if name.startswith('right_')]
    self.assertTrue(explainer.fixed_data.equals(el[right_columns]))

    feature_names = [
        pair[0] for pair in explainer.explanations['right1'].as_list()
    ]
    self.assertEqual(feature_names, expected_encoding.split(' '))
def test_explain_instance_leftRight_addAFTERRight(self):
    """Explain the left side with the right side fixed and appended afterwards.

    Two records are explained with the same explainer: one with plain
    tokens and one whose right attributes contain punctuation.  In both
    cases every perturbed left attribute must end with the matching right
    attribute value, and the variable encoding must contain only the left
    tokens.
    """
    columns = ['id', 'left_A', 'left_B', 'right_A', 'right_B']
    lstring1, lstring2 = 'l1 l2 l3 l4', 'm1 m2 m3 m4'
    left_string = lstring1 + ' ' + lstring2

    # --- First record: plain whitespace-separated tokens -----------------
    rstring1, rstring2 = 'r1 r2 r3', 's1 s2 s3 s4 s5'
    el = pd.DataFrame([[1, lstring1, lstring2, rstring1, rstring2]],
                      columns=columns)
    explainer = Landmark(self.fake_pred, el, lprefix='left_',
                         rprefix='right_', split_expression=r' ')
    explainer.explain_instance(el,
                               variable_side='left',
                               fixed_side='right',
                               add_after_perturbation='right',
                               num_samples=500)
    self.assertTrue(
        explainer.tmp_dataset.left_A.str.endswith(rstring1).all())
    self.assertTrue(
        explainer.tmp_dataset.left_B.str.endswith(rstring2).all())
    # One explanation feature per left token.
    self.assertEqual(len(explainer.explanations['right1'].as_list()),
                     len(left_string.split(' ')))

    # --- Second record: right attributes containing punctuation ----------
    # The backslash is escaped explicitly; the runtime value is unchanged
    # but the literal no longer relies on an invalid escape sequence.
    rstring1, rstring2 = 'r1 r2-r3', 's1 s2+\'s3 s4 s\\|_.,#ù[{5'
    el = pd.DataFrame([[1, lstring1, lstring2, rstring1, rstring2]],
                      columns=columns)
    explainer.explain_instance(el,
                               variable_side='left',
                               fixed_side='right',
                               add_after_perturbation='right',
                               num_samples=500)
    self.assertTrue(
        explainer.tmp_dataset.left_A.str.endswith(rstring1).all())
    self.assertTrue(
        explainer.tmp_dataset.left_B.str.endswith(rstring2).all())
    self.assertTrue(
        explainer.fixed_data.equals(
            pd.DataFrame({
                'right_A': [rstring1],
                'right_B': [rstring2]
            })))

    encoded = 'A00_l1 A01_l2 A02_l3 A03_l4 B00_m1 B01_m2 B02_m3 B03_m4'
    self.assertEqual(explainer.variable_data, encoded)
    self.assertEqual(
        [x[0] for x in explainer.explanations['right1'].as_list()],
        re.split(' ', encoded))
def test_evaluate_set(self):
    """Evaluate an 'all'-side explanation for a single record.

    The explainer is built with ``exclude_attrs=[]`` and the evaluation is
    run with ``variable_side='all'``; the expected encoding therefore spans
    the four attributes (prefixes A-D) and no fixed data is expected.
    """
    lstring1, lstring2, rstring1, rstring2 = 'l1 l2', 'm1 m2', 'r1', 's1 s2 s3'
    el = pd.DataFrame([[1, lstring1, lstring2, rstring1, rstring2]],
                      columns=['id', 'left_A', 'left_B', 'right_A', 'right_B'])
    explainer = Landmark(self.random_pred, el, exclude_attrs=[],
                         lprefix='left_', rprefix='right_',
                         split_expression=r' ')
    impacts_match = explainer.explain(el, num_samples=self.num_samples)
    ev = Evaluate_explanation(impacts_match, el,
                              predict_method=self.random_pred,
                              percentage=.25, num_round=20)
    results = ev.evaluate_set([1], 'all', variable_side='all')

    encoded = 'A00_l1 A01_l2 B00_m1 B01_m2 C00_r1 D00_s1 D01_s2 D02_s3'
    self.assertEqual(ev.variable_encoded, encoded)
    # Identity check: ``== None`` on a DataFrame yields a DataFrame whose
    # truth value is ambiguous, so assertEqual would raise, not fail.
    self.assertIsNone(ev.fixed_data)
    # Compare plain lists: an ndarray-vs-list assertEqual only works while
    # the array has exactly one element.
    self.assertEqual(results.id.unique().tolist(), [1])
def test_Evaluate_Right_Left_addBEFORELeft(self):
    """Evaluate a right-variable / left-fixed explanation with left tokens prepended.

    The expected encoding shows the right tokens followed by the injected
    left tokens for each attribute, and the fixed data must equal the
    left_* columns of the record.
    """
    record = [1, 'l1 l2 l3 l1 l5', 'm1 m2', 'r1 r2 r3', 's1 s2']
    el = pd.DataFrame(
        [record],
        columns=['id', 'left_A', 'left_B', 'right_A', 'right_B'])

    explainer = Landmark(self.random_pred, el, lprefix='left_',
                         rprefix='right_', split_expression=r' ')
    impacts_match = explainer.explain_instance(
        el,
        variable_side='right',
        fixed_side='left',
        add_before_perturbation='left',
        num_samples=self.num_samples)

    # NOTE(review): this label reads like the "add after" scenario although
    # the test uses add_before_perturbation; it is only used as a matching
    # key below - confirm the naming is intentional.
    conf_name = 'R_R+Lafter'
    impacts_match['conf'] = conf_name

    ev = Evaluate_explanation(impacts_df=impacts_match,
                              dataset=el,
                              predict_method=self.random_pred,
                              percentage=.25,
                              num_round=5)
    ev.evaluate_set([1], conf_name,
                    variable_side='right',
                    fixed_side='left',
                    add_before_perturbation='left')

    expected_encoding = ('A00_r1 A01_r2 A02_r3 A03_l1 A04_l2 A05_l3 A06_l1 '
                         'A07_l5 B00_s1 B01_s2 B02_m1 B03_m2')
    self.assertEqual(ev.variable_encoded, expected_encoding)

    left_columns = [name for name in el.columns if name.startswith('left_')]
    self.assertTrue(ev.fixed_data.equals(el[left_columns]))
def test_Evaluate_Right_Right_addAFTERLeft(self):
    """Evaluate a right-variable / right-fixed explanation with left tokens appended.

    Perturbed left attributes must end with the original left values, the
    variable encoding must hold only right tokens, and the fixed data must
    equal the right_* columns of the record.
    """
    left_a, left_b = 'l1 l2 l3 l1 l5', 'm1 m2 m3 m4'
    right_a, right_b = 'r1 r2 r3', 's1 s2'
    el = pd.DataFrame(
        [[1, left_a, left_b, right_a, right_b]],
        columns=['id', 'left_A', 'left_B', 'right_A', 'right_B'])

    explainer = Landmark(self.random_pred, el, lprefix='left_',
                         rprefix='right_', split_expression=r' ')
    impacts_match = explainer.explain_instance(
        el,
        variable_side='right',
        fixed_side='right',
        add_after_perturbation='left',
        num_samples=self.num_samples)

    # Tag the impacts so evaluate_set can be asked for this configuration.
    conf_name = 'R_R+Lafter'
    impacts_match['conf'] = conf_name

    ev = Evaluate_explanation(impacts_match, el,
                              predict_method=self.random_pred,
                              percentage=.25,
                              num_round=5)
    ev.evaluate_set([1], conf_name,
                    variable_side='right',
                    fixed_side='right',
                    add_after_perturbation='left')

    perturbed = ev.perturbed_elements
    self.assertTrue(perturbed.left_A.str.endswith(left_a).all())
    self.assertTrue(perturbed.left_B.str.endswith(left_b).all())

    expected_encoding = 'A00_r1 A01_r2 A02_r3 B00_s1 B01_s2'
    self.assertEqual(ev.variable_encoded, expected_encoding)

    right_columns = [name for name in el.columns if name.startswith('right_')]
    self.assertTrue(ev.fixed_data.equals(el[right_columns]))
def test_explain_instance_ALL(self):
    """Explain a record with every attribute on the variable side.

    The record carries an extra ``match_score`` column and a trailing space
    in ``right_B``; the expected encoding contains neither, only the tokens
    of the four left/right attributes (prefixes A-D).
    """
    columns = ['id', 'match_score', 'left_A', 'left_B', 'right_A', 'right_B']
    record = [1, 0.9, 'l1 l2 l3 l4', 'm1 m2 m3 m4', 'r1 r2 r3', 's1 s2 ']
    el = pd.DataFrame([record], columns=columns)

    explainer = Landmark(self.fake_pred, el, lprefix='left_',
                         rprefix='right_', split_expression=r' ')
    explainer.explain_instance(el, variable_side='all', num_samples=500)

    expected_encoding = (
        'A00_l1 A01_l2 A02_l3 A03_l4 B00_m1 B01_m2 B02_m3 B03_m4 '
        'C00_r1 C01_r2 C02_r3 D00_s1 D01_s2')
    self.assertEqual(explainer.variable_data, expected_encoding)

    feature_names = [
        pair[0] for pair in explainer.explanations['all1'].as_list()
    ]
    self.assertEqual(feature_names, expected_encoding.split(' '))
def test_explain_instance_leftRight_addBEFORERight(self):
    """Explain the left side with right tokens injected before perturbation.

    Two records are explained with the same explainer (built from the
    first record).  In each case the expected encoding lists, per
    attribute, the left tokens followed by the injected right tokens.
    """
    columns = ['id', 'left_A', 'left_B', 'right_A', 'right_B']

    plain_record = pd.DataFrame(
        [[1, 'l1 l2 l3 l1 l5', 'm1 m2 m3 m4', 'r1 r2 r3', 's1 s2']],
        columns=columns)
    # Right attributes with punctuation and tokens shared with the left.
    punctuated_record = pd.DataFrame(
        [[1, 'l1 l2 l3 l4', 'm1 m2 m3 m4', 'r1 r2-r3 l4', 's1 s2+\'s3 m2']],
        columns=columns)

    scenarios = [
        (plain_record,
         'A00_l1 A01_l2 A02_l3 A03_l1 A04_l5 A05_r1 A06_r2 A07_r3 B00_m1 B01_m2 B02_m3 B03_m4 B04_s1 B05_s2'),
        (punctuated_record,
         'A00_l1 A01_l2 A02_l3 A03_l4 A04_r1 A05_r2-r3 A06_l4 B00_m1 B01_m2 B02_m3 B03_m4 B04_s1 B05_s2+\'s3 B06_m2'),
    ]

    explainer = Landmark(self.fake_pred, plain_record, lprefix='left_',
                         rprefix='right_', split_expression=r' ')

    for el, expected_encoding in scenarios:
        explainer.explain_instance(el,
                                   variable_side='left',
                                   fixed_side='right',
                                   add_before_perturbation='right',
                                   num_samples=500)
        self.assertEqual(explainer.variable_data, expected_encoding)
        feature_names = [
            pair[0] for pair in explainer.explanations['right1'].as_list()
        ]
        self.assertEqual(feature_names, expected_encoding.split(' '))
def test_Mapper_encode_attr(self):
    """Check token encoding under two different split expressions.

    First a single-space split is used on plain tokens; then a new
    explainer splitting on non-word characters is used on a record that
    contains punctuation.  In both cases the fixed data and the encoded
    variable data are compared with their expected values.
    """
    columns = ['id', 'left_A', 'left_B', 'right_A', 'right_B']

    # --- Split on a single space -----------------------------------------
    lstring1, lstring2, rstring1, rstring2 = 'l1 l2 l3 l4', 'm1 m2 m3 m4', 'r1 r2 r3', 's1 s2 s3 s4 s5'
    el = pd.DataFrame([[1, lstring1, lstring2, rstring1, rstring2]],
                      columns=columns)
    explainer = Landmark(self.fake_pred, el, lprefix='left_',
                         rprefix='right_', split_expression=r' ')
    explainer.explain_instance(el,
                               variable_side='left',
                               fixed_side='right',
                               add_after_perturbation='right',
                               num_samples=500)
    self.assertTrue(
        explainer.fixed_data.equals(
            pd.DataFrame({
                'right_A': [rstring1],
                'right_B': [rstring2]
            })))
    self.assertEqual(
        explainer.variable_data,
        'A00_l1 A01_l2 A02_l3 A03_l4 B00_m1 B01_m2 B02_m3 B03_m4')

    # --- Split on runs of non-word characters ----------------------------
    # \W matches any non-word character; word characters are a-z, A-Z,
    # 0-9 and the underscore, so "l2_" stays a single token below.
    explainer = Landmark(self.fake_pred, el, lprefix='left_',
                         rprefix='right_', split_expression=r'\W+')
    # The backslash is escaped explicitly; the runtime value is unchanged
    # but the literal no longer relies on an invalid escape sequence.
    lstring1, lstring2, rstring1, rstring2 = 'l1 l2_.l3 l1^.,l5', 'm1 m2 m3 m4', 'r1 r2-r3', 's1 s2+\'s3 s4 s\\|_.,#ù[{5'
    left_string = lstring1 + ' ' + lstring2
    el = pd.DataFrame([[1, lstring1, lstring2, rstring1, rstring2]],
                      columns=columns)
    explainer.explain_instance(el,
                               variable_side='left',
                               fixed_side='right',
                               add_after_perturbation='right',
                               num_samples=500)
    # The expected fixed data has the non-word separators replaced by spaces.
    self.assertTrue(
        explainer.fixed_data.equals(
            pd.DataFrame({
                'right_A': ['r1 r2 r3'],
                'right_B': ['s1 s2 s3 s4 s _ ù 5']
            })))
    self.assertEqual(
        explainer.variable_data,
        'A00_l1 A01_l2_ A02_l3 A03_l1 A04_l5 B00_m1 B01_m2 B02_m3 B03_m4')
    # One explanation feature per left token produced by the same split.
    self.assertEqual(len(explainer.explanations['right1'].as_list()),
                     len(re.split(r'\W+', left_string)))
def setUpClass(cls) -> None:
    """Load the Abt-Buy test split and build the shared class fixtures.

    The dataset root defaults to the original developer path and can be
    overridden with the ``LANDMARK_DATASETS_DIR`` environment variable so
    the suite can run on other machines.  Sets ``cls.data``,
    ``cls.tp_group``, ``cls.tn_group``, ``cls.exclude_attrs``,
    ``cls.explainer`` and ``cls.el``.
    """
    # NOTE(review): unittest calls setUpClass on the class, so it needs a
    # @classmethod decorator; none is visible in this excerpt - confirm.
    default_root = 'C:\\Users\\Barald\\UNI Gdrive\\EM Explanations Baraldi\\datasets'
    dataset_root = os.environ.get('LANDMARK_DATASETS_DIR', default_root)
    dataset_path = os.path.join(dataset_root, 'Abt-Buy')
    cls.data = pd.read_csv(os.path.join(dataset_path, 'test.csv'))

    def fake_pred(x):
        """Return a constant 0.5 score for every row of ``x``."""
        return np.ones((x.shape[0], )) * 0.5

    proba = fake_pred(cls.data)
    # NOTE(review): labels are compared with the strings '1' / '0'; if
    # read_csv infers a numeric dtype for 'label' these masks select
    # nothing - verify against the CSV.
    cls.tp_group = cls.data[(1 - proba >= 0.5)
                            & (cls.data['label'] == '1')].head(2)
    cls.tn_group = cls.data[(proba >= 0.5)
                            & (cls.data['label'] == '0')].head(2)
    cls.exclude_attrs = ['left_id', 'right_id', 'label', 'id']
    cls.explainer = Landmark(fake_pred, cls.data,
                             exclude_attrs=cls.exclude_attrs,
                             lprefix='left_', rprefix='right_',
                             split_expression=r' ')
    # Single record at positional row 126, copied so tests can modify it.
    cls.el = cls.data.iloc[[126]].copy()