def test_find_similar_entries(self, entries, _, __):
    """
    plugin "beancount.plugins.auto_accounts"

    2016-01-03 *
      Expenses:Tips          1.03 USD
      Assets:Other

    2016-01-04 *
      Expenses:Coffee        1.04 USD
      Assets:Other

    2016-01-05 *
      Expenses:Restaurant    1.05 USD
      Assets:Other

    2016-01-06 *
      Expenses:Groceries     1.06 USD
      Assets:Other

    2016-01-07 *
      Expenses:Alcohol       1.07 USD
      Assets:Other

    2016-01-08 *
      Expenses:Smoking       1.08 USD
      Assets:Other

    2016-01-09 *
      Expenses:Taxi          1.09 USD
      Assets:Other
    """
    # A single candidate transaction that matches the 2016-01-06 entry above.
    new_entries, _, __ = loader.load_string("""
      plugin "beancount.plugins.auto_accounts"

      2016-01-06 *
        Expenses:Groceries     1.06 USD
        Assets:Other
    """)

    # Regardless of the window size, an always-true comparator must yield
    # exactly one match and an always-false comparator must yield none.
    accept_all = lambda e1, e2: True
    reject_all = lambda e1, e2: False
    for window, expected in [(0, 1), (1, 1), (2, 1)]:
        matches = similar.find_similar_entries(
            new_entries, entries, accept_all, window_days=window)
        self.assertEqual(expected, len(matches))

        matches = similar.find_similar_entries(
            new_entries, entries, reject_all, window_days=window)
        self.assertEqual(0, len(matches))
def find_duplicate_entries(new_entries_list, existing_entries):
    """Flag potentially duplicate entries.

    Args:
      new_entries_list: A list of pairs of (key, lists of imported entries),
        one for each importer. The key identifies the filename and/or importer
        that yielded those new entries.
      existing_entries: A list of previously existing entries from the target
        ledger.
    Returns:
      A list of lists of modified new entries (like new_entries_list),
      potentially with modified metadata to indicate those which are
      duplicated.
    """
    annotated_list = []
    for key, imported in new_entries_list:
        # Compare the imported entries against the existing ledger only.
        pairs = similar.find_similar_entries(imported, existing_entries)
        # Identity, not equality: only flag the exact objects matched.
        dup_ids = {id(new_entry) for new_entry, _ in pairs}

        annotated = []
        for entry in imported:
            if id(entry) in dup_ids:
                # Copy before mutating: the metadata dict may be shared.
                meta = entry.meta.copy()
                meta[DUPLICATE_META] = True
                entry = entry._replace(meta=meta)
            annotated.append(entry)
        annotated_list.append((key, annotated))
    return annotated_list
def __call__(self, importer, file, imported_entries, existing_entries):
    """Add duplicate metadata for imported transactions.

    Args:
      imported_entries: The list of imported entries.
      existing_entries: The list of existing entries as passed to the
        importer.
    Returns:
      A list of entries, modified by this detector.
    """
    pairs = similar.find_similar_entries(
        imported_entries,
        existing_entries,
        self.comparator,
        self.window_days,
    )
    # Track matched entries by identity so equal-but-distinct entries
    # are not flagged by accident.
    dup_ids = {id(entry) for entry, _ in pairs}

    result = []
    for entry in imported_entries:
        if id(entry) in dup_ids:
            # Annotate a copy of the metadata rather than mutating in place.
            meta = entry.meta.copy()
            meta["__duplicate__"] = True
            entry = entry._replace(meta=meta)
        result.append(entry)
    return result
def find_and_delete_duplicates(l):
    """Remove duplicated entries from *l* in place.

    Two passes are made: first, near-exact duplicates within a 4-day window
    are dropped; second, matching transfer ("Umbuchung") pairs within a
    14-day window are printed for inspection and dropped.

    Args:
      l: A mutable list of entries; modified in place.
    """
    # BUG FIX: the original iterated over `l` while calling `l.remove(...)`
    # inside the loop, which silently skips elements.  Iterate over a
    # snapshot instead, and track removed entries by identity so we never
    # reprocess or re-remove one.
    removed_ids = set()
    for entry in list(l):
        if id(entry) in removed_ids:
            continue
        for _, dup in similar.find_similar_entries([entry], l, window_days=4):
            if dup != entry:
                l.remove(dup)
                removed_ids.add(id(dup))

    # Hoisted out of the loop: the comparator is loop-invariant, so there is
    # no reason to construct a fresh one per entry.
    comparator = UmbuchungsComparator(datetime.timedelta(14))
    removed_ids = set()
    for entry in list(l):
        if id(entry) in removed_ids:
            continue
        duplicates_umbuchung = similar.find_similar_entries(
            [entry], l, comparator=comparator, window_days=14)
        for _, dup in duplicates_umbuchung:
            print(id(dup))
            printer.print_entries([dup])
            print(id(entry))
            printer.print_entries([entry])
            print('\n' + '=' * 20)
            l.remove(dup)
            removed_ids.add(id(dup))
def test_find_similar_entries__multiple_matches(self, entries, _, __):
    """
    plugin "beancount.plugins.auto_accounts"

    2016-02-01 * "A"
      Assets:Account1    10.00 USD
      Assets:Account2   -10.00 USD

    2016-02-02 * "B"
      Assets:Account1    10.00 USD
      Assets:Account2   -10.00 USD

    2016-02-03 * "C"
      Assets:Account1    10.00 USD
      Assets:Account2   -10.00 USD

    2016-02-04 * "D"
      Assets:Account1    10.00 USD
      Assets:Account2   -10.00 USD

    2016-02-05 * "D"
      Assets:Account1    10.00 USD
      Assets:Account2   -10.00 USD
    """
    txns = list(data.filter_txns(entries))

    # A single candidate: it must match exactly once, and the match pair
    # must carry the candidate itself as its first element.
    candidates = txns[2:3]
    matches = similar.find_similar_entries(candidates, entries, window_days=1)
    self.assertEqual(1, len(matches))
    self.assertEqual(candidates[0], matches[0][0])

    # Several candidates at once: each one should produce a match.
    candidates = txns[1:4]
    matches = similar.find_similar_entries(candidates, entries, window_days=1)
    self.assertEqual(len(candidates), len(matches))
def entry_is_in_ledger(self, entry):
    """Determine whether *entry* is already in the ledger.

    Used AFTER the entry has been processed.  An entry counts as present
    when the Umbuchung comparator finds at least one similar entry within
    a 14-day window.

    Args:
      entry: The entry to look for.
    Returns:
      True if a similar entry exists in self.ledger, False otherwise.
    """
    duplicates_um = similar.find_similar_entries(
        [entry], self.ledger, comparator=self.U, window_days=14)
    # Idiomatic emptiness test instead of len(...) == 0.
    if not duplicates_um:
        return False
    # BUG FIX: the original wrapped this print in a broad
    # try/except Exception that could only hide real errors; a bare
    # print of a constant string needs no exception handling.
    print('Umbuchung')
    return True
def annotate_duplicate_entries(self, new_entries):
    """Flag potentially duplicate entries.

    Args:
      new_entries: A list of lists of imported entries, one for each
        importer.
    Returns:
      Modifies new_entries in-place, potentially with modified metadata to
      indicate those which are duplicated.
    """
    # Compare the imported entries against the existing ledger only.
    pairs = similar.find_similar_entries(new_entries, self.existing_entries)
    # Collect matched entries by identity, then mark their metadata in place.
    dup_ids = {id(entry) for entry, _ in pairs}
    for entry in new_entries:
        if id(entry) in dup_ids:
            entry.meta[DUPLICATE_META] = True
def extract_from_file(filename, importer, existing_entries=None, min_date=None,
                      allow_none_for_tags_and_links=False):
    """Import entries from file 'filename' with the given matches,

    Also cross-check against a list of provided 'existing_entries' entries,
    de-duplicating and possibly auto-categorizing.

    Args:
      filename: The name of the file to import.
      importer: An importer object that matched the file.
      existing_entries: A list of existing entries parsed from a ledger, used
        to detect duplicates and automatically complete or categorize
        transactions.
      min_date: A date before which entries should be ignored. This is useful
        when an account has a valid check/assert; we could just ignore
        whatever comes before, if desired.
      allow_none_for_tags_and_links: A boolean, whether to allow plugins to
        generate Transaction objects with None as value for the 'tags' or
        'links' attributes.
    Returns:
      A list of new imported entries and a subset of these which have been
      identified as possible duplicates.
    Raises:
      Exception: If there is an error in the importer's extract() method.
    """
    # Extract the entries.  Exceptions are deliberately allowed to propagate:
    # surfacing the full traceback makes developing importers much easier.
    entries = importer.extract(cache.get_file(filename))
    if not entries:
        return [], []

    # Sort here rather than trusting the importer to emit sorted output.
    entries.sort(key=data.entry_sortkey)

    # Validate the types of every extracted entry.
    for entry in entries:
        data.sanity_check_types(entry, allow_none_for_tags_and_links)

    # Drop everything dated before 'min_date' (entries are sorted by date).
    if min_date:
        entries = list(itertools.dropwhile(
            lambda entry: entry.date < min_date, entries))

    # Cross-check against the existing ledger and mark likely duplicates.
    duplicates = []
    if existing_entries is not None:
        pairs = similar.find_similar_entries(entries, existing_entries)
        dup_ids = {id(entry) for entry, _ in pairs}

        marked = []
        for entry in entries:
            if id(entry) in dup_ids:
                # Annotate a copy of the metadata with the duplicate marker.
                meta = entry.meta.copy()
                meta[DUPLICATE_META] = True
                entry = entry._replace(meta=meta)
                duplicates.append(entry)
            marked.append(entry)
        entries = marked

    return entries, duplicates