def test_checksum(self):
    detector = DocumentFieldMultilineRegexDetector()
    detector.csv_content = self.csv_text
    detector.update_checksum()
    self.assertGreater(len(detector.csv_checksum), 10)

    cs_old = detector.csv_checksum
    detector.csv_content = detector.csv_content[:-1] + ';'
    detector.update_checksum()
    self.assertGreater(len(detector.csv_checksum), 10)
    self.assertNotEqual(cs_old, detector.csv_checksum)
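
# Illustrative sketch only, not part of the model under test: test_checksum() above only
# requires that update_checksum() derives a digest longer than 10 characters from
# csv_content and that the digest changes whenever the content changes. A hypothetical
# stand-in satisfying those expectations (the real model method may hash differently):
def _sketch_update_checksum(csv_content: str) -> str:
    import hashlib
    return hashlib.md5((csv_content or '').encode('utf-8')).hexdigest()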
def test_combine_dfs(self):
    # first row is the same; second row - same value, another pattern;
    # third row - same pattern, another value; last row - brand new
    another_text = """
,value,pattern
0,"Big Bank & Company (004578) (Knight, Bobby (Charlotte); Bryant, Koby (Charlotte); Williams, Gary (Charlotte); Johnson, Magic (Charlotte); Lobo, Rebecca (Charlotte))","\bbig\s{1,5}bank\s{1,5}.{1,5}\s{1,5}company\s{1,5}(004578)\b"
1,"Family Name Limited (173437) (Tanner, Rebecca (Houston); Saget, Bob (Houston))","\bfamily\s{1,5}guy(173437)\b"
2,"Eye-Eyes Communications (018951)","\ball\s{1,5}eyes\s{1,5}communications\s{1,5}(018951)\b"
3,"John Smith Archives, LLC d/b/a Charlie (085292) (Flay, Bobby (New York))","\bcharlie\s{1,5}(085292)\b"
"""
    with StringIO(another_text) as cs_stream:
        df = pd.read_csv(cs_stream, usecols=[1, 2])

    detector = DocumentFieldMultilineRegexDetector()
    detector.csv_content = self.csv_text
    detector.update_checksum()
    detector.combine_with_dataframe(df)

    df_new = detector.get_as_pandas_df()
    row_val = []  # type: List[Tuple[str, str]]
    for i, row in df_new.iterrows():
        row_val.append((row[0], row[1]))
    self.assertEqual(8, len(row_val))

    self.assertTrue((
        'John Smith Archives, LLC d/b/a Charlie (085292) (Flay, Bobby (New York))',
        '\bcharlie\s{1,5}(085292)\b',
    ) in row_val)
    self.assertTrue((
        'Big Bank & Company (004578) (Knight, Bobby (Charlotte); Bryant, Koby ' +
        '(Charlotte); Williams, Gary (Charlotte); Johnson, Magic (Charlotte); ' +
        'Lobo, Rebecca (Charlotte))',
        '\bbig\s{1,5}bank\s{1,5}.{1,5}\s{1,5}company\s{1,5}(004578)\b',
    ) in row_val)
    self.assertTrue((
        'Family Name Limited (173437) (Tanner, Rebecca (Houston); Saget, Bob (Houston))',
        '\bfamily\s{1,5}guy(173437)\b',
    ) in row_val)
    self.assertTrue((
        'Family Name Limited (173437) (Tanner, Rebecca (Houston); Saget, Bob (Houston))',
        '\bfamily\s{1,5}name(173437)\b',
    ) in row_val)
    self.assertTrue((
        'Eye-Eyes Communications (018951)',
        '\ball\s{1,5}eyes\s{1,5}communications\s{1,5}(018951)\b',
    ) in row_val)
    # this one is replaced
    self.assertFalse((
        'All Eyes Communications (018951) (Moore, Michael (New York); Tarantino, Quentin ' +
        '(San Francisco); Lee, Spike (New York); Levinson, Barry (Charlotte))',
        '\ball\s{1,5}eyes\s{1,5}communications\s{1,5}(018951)\b',
    ) in row_val)
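
# Illustrative sketch only (hypothetical helper, not the model's API): test_combine_dfs()
# above expects combine_with_dataframe() to merge the incoming rows into the stored CSV so
# that a row sharing a regexp pattern with a stored row overwrites that row's value, while
# new value/pattern combinations are appended. Assuming both frames have 'value' and
# 'pattern' columns and reusing the module-level `pandas as pd` import already used in this
# file, that behaviour could look like:
def _sketch_combine_detector_frames(existing_df, new_df):
    combined = pd.concat([existing_df, new_df], ignore_index=True)
    # keep='last' lets an incoming row replace a stored row that has the same pattern
    return combined.drop_duplicates(subset='pattern', keep='last').reset_index(drop=True)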
def test_get_as_pd(self):
    detector = DocumentFieldMultilineRegexDetector()
    detector.csv_content = self.csv_text
    df = detector.get_as_pandas_df()
    self.assertIsNotNone(df)
    self.assertEqual((6, 2), df.shape)
def setup_mock():
    doc_field.uid = 'ABCDEF'
    doc_field.code = 'client'
    csv_text = """
,value,pattern
0,"Big Bank & Company (004578) (Knight, Bobby (Charlotte); Bryant, Koby (Charlotte); Williams, Gary (Charlotte); Johnson, Magic (Charlotte); Lobo, Rebecca (Charlotte))","\bbig\s{1,5}bank\s{1,5}.{1,5}\s{1,5}company\s{1,5}(004578)\b"
1,"Family Name Limited (173437) (Tanner, Rebecca (Houston); Saget, Bob (Houston))","family\s{1,5}name\s{1,5}\(173437\)"
2,"Financial Services & Co. (015607) (Spelling, Tori (Chicago); Priestley, Jason (Dallas); Perry, Luke (New York); Doherty, Shannon (Chicago); Garth, Jenny (Chicago))","\bfinancial\s{1,5}services\s{1,5}.{1,5}(015607)\b"
3,"Food Wholsale, Inc. (056230) (Jenner, Bruce (Chicago))","\bfood\s{1,5}wholsale,(056230)\b"
4,"All Eyes Communications (018951) (Moore, Michael (New York); Tarantino, Quentin (San Francisco); Lee, Spike (New York); Levinson, Barry (Charlotte))","\ball\s{1,5}eyes\s{1,5}communications\s{1,5}(018951)\b"
5,"Joe Smith Archives, LLC d/b/a Foxtrot (085292) (Flay, Bobby (New York))","\bfoxtrot\s{1,5}(085292)\b \bjoe\s{1,5}smith\s{1,5}archives\b"
"""
    detector = DocumentFieldMultilineRegexDetector()
    detector.csv_content = csv_text
    detector.document_field = doc_field
    detector.update_checksum()
    CsvRegexpsDetectionCacheMock.detector_by_field[doc_field.uid] = detector
def save_detector_settings_csv(self,
                               detectors_by_value: Dict[str, List[str]]) -> None:
    detector = DocumentFieldMultilineRegexDetector()
    detector.document_field = self.document_field

    # build a two-column value/pattern frame from the collected detector settings
    df = pd.DataFrame(columns=['value', 'pattern'])
    df.set_index("pattern", inplace=True)
    for field_val in detectors_by_value:
        for include_reg_value in detectors_by_value[field_val]:
            df = df.append({'value': field_val, 'pattern': include_reg_value},
                           ignore_index=True)
    df.drop_duplicates(subset='pattern', inplace=True)

    try:
        existing = DocumentFieldMultilineRegexDetector.objects.get(
            document_field_id=self.document_field.uid
        )  # type: DocumentFieldMultilineRegexDetector
    except DocumentFieldMultilineRegexDetector.DoesNotExist:
        # no stored record for this field yet - create one
        detector.csv_content = df.to_csv()
        detector.update_checksum()
        detector.save()
        return

    # just replace the CSV content and its checksum
    if self.drop_previous_field_detectors:
        existing.csv_content = df.to_csv()
        existing.update_checksum()
        existing.save()
        return

    # otherwise join the new options with the existing ones,
    # overwriting duplicates by detected value or regexp pattern
    existing.combine_with_dataframe(df)
    existing.save()
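
# Illustrative call shape (hypothetical values): detectors_by_value maps each detected
# field value to the list of regexp patterns that should resolve to it, e.g.
#
#     self.save_detector_settings_csv({
#         'Big Bank & Company (004578)': ['\\bbig\\s{1,5}bank\\b'],
#         'All Eyes Communications (018951)': ['\\ball\\s{1,5}eyes\\b',
#                                              '\\bcommunications\\b'],
#     })
#
# The mapping is serialised to a value/pattern CSV, de-duplicated by pattern, and then
# either stored as a new DocumentFieldMultilineRegexDetector record, used to replace the
# existing record, or merged into it, depending on drop_previous_field_detectors.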