def test_identify_collision(self):
    """Two importers claiming the same document must raise an error."""
    importers = [
        tests.utils.Importer('A', 'Assets:Tests', 'text/csv'),
        tests.utils.Importer('B', 'Assets:Tests', 'text/csv'),
    ]
    # A document matched by no importer yields None.
    importer = identify.identify(importers, path.abspath('test.txt'))
    self.assertIsNone(importer)
    # Both importers match CSV files: the ambiguity is an error.
    with self.assertRaises(exceptions.Error):
        identify.identify(importers, path.abspath('test.csv'))
def _identify(ctx, src, failfast, verbose):
    """Identify files for import.

    Walk the SRC list of files or directories and report each file
    identified by one of the configured importers. When verbose
    output is requested, also print the account name associated to
    the document by the importer.

    """
    log = utils.logger(verbose)
    errors = exceptions.ExceptionsTrap(log)

    for filename in _walk(src, log):
        with errors:
            importer = identify.identify(ctx.importers, filename)
            if not importer:
                # No importer claimed the file: just terminate the line.
                log('')
                continue

            # Signal processing of this document.
            log(' ...', nl=False)

            # Look up the associated account only in verbose mode.
            account = None
            if verbose:
                account = importer.file_account(cache.get_file(filename))

            log(' OK', fg='green')
            log(f' {importer.name()}')
            # Emitted only at verbosity level 1 and above.
            log(f' {account}', 1)

        if failfast and errors:
            break

    # Non-zero exit status when any document failed processing.
    if errors:
        sys.exit(1)
def test_identify(self):
    """Each document is matched to the importer claiming its MIME type."""
    importers = [
        tests.utils.Importer('A', 'Assets:Tests', 'application/pdf'),
        tests.utils.Importer('B', 'Assets:Tests', 'text/csv'),
    ]
    # Pass an absolute path to identify() to make the cache code
    # used internally by the importers happy. This can go away
    # once FileMemo is removed from the importers interface.
    self.assertIsNone(
        identify.identify(importers, path.abspath('test.txt')))
    pdf_importer = identify.identify(importers, path.abspath('test.pdf'))
    self.assertEqual(pdf_importer.name, 'A')
    csv_importer = identify.identify(importers, path.abspath('test.csv'))
    self.assertEqual(csv_importer.name, 'B')
def _extract(ctx, src, output, existing, reverse, failfast, quiet):
    """Extract transactions from documents.

    Walk the SRC list of files or directories and extract the ledger
    entries from each file identified by one of the configured
    importers. The entries are written to the specified output file
    or to the standard output in Beancount ledger format in sections
    associated to the source document.

    """
    verbosity = -quiet
    log = utils.logger(verbosity, err=True)
    errors = exceptions.ExceptionsTrap(log)

    # Load the ledger, if one is specified.
    existing_entries = loader.load_file(existing)[0] if existing else []

    extracted = []
    for filename in _walk(src, log):
        with errors:
            importer = identify.identify(ctx.importers, filename)
            if not importer:
                # No importer claimed the file: just terminate the line.
                log('')
                continue

            # Signal processing of this document.
            log(' ...', nl=False)

            # Extract entries.
            entries = extract.extract_from_file(
                importer, filename, existing_entries)
            extracted.append((filename, entries))
            log(' OK', fg='green')

        if failfast and errors:
            break

    # Invoke hooks. Fall back on duplicate detection when none are set.
    if ctx.hooks is None:
        hooks = [extract.find_duplicate_entries]
    else:
        hooks = ctx.hooks
    for func in hooks:
        extracted = func(extracted, existing_entries)

    # Reverse sort order, if requested.
    if reverse:
        for _, entries in extracted:
            entries.reverse()

    # Serialize entries.
    extract.print_extracted_entries(extracted, output)

    if errors:
        sys.exit(1)
def _archive(ctx, src, destination, dry_run, overwrite, failfast):
    """Archive documents.

    Walk the SRC list of files or directories and move each file
    identified by one of the configured importers in a directory
    hierarchy mirroring the structure of the accounts associated to
    the documents and with a file name composed by the document date
    and document name returned by the importer.

    Documents are moved to their filing location only when no errors
    are encountered processing all the input files. Documents in the
    destination directory are not overwritten, unless the --force
    option is used. When the directory hierarchy root is not
    specified with the --destination DIR options, it is assumed to
    be directory in which the ingest script is located.

    """
    # If the output directory is not specified, move the files at the
    # root where the import script is located. Providing this default
    # seems better than using a required option.
    if destination is None:
        import __main__
        destination = os.path.dirname(os.path.abspath(__main__.__file__))

    log = utils.logger()
    errors = exceptions.ExceptionsTrap(log)

    renames = []
    for filename in _walk(src, log):
        with errors:
            importer = identify.identify(ctx.importers, filename)
            if not importer:
                # No importer claimed the file: just terminate the line.
                log('')
                continue

            # Signal processing of this document.
            log(' ...', nl=False)

            # Prepend destination directory path to the filing location.
            destpath = os.path.join(
                destination, archive.filepath(importer, filename))

            # Check for destination filename collisions.
            if any(dst == destpath for _, dst in renames):
                raise exceptions.Error(
                    'Collision in destination file path.', destpath)

            # Check if the destination file already exists.
            if not overwrite and os.path.exists(destpath):
                raise exceptions.Error(
                    'Destination file already exists.', destpath)

            renames.append((filename, destpath))
            log(' OK', fg='green')
            log(f' {destpath}')

        if failfast and errors:
            break

    # If there are any errors, stop here.
    if errors:
        log('# Errors detected: documents will not be filed.')
        sys.exit(1)

    if not dry_run:
        for src, dst in renames:
            archive.move(src, dst)
def identify(self, what, *args, **kwargs):
    """Identify the importer for a document.

    Thin wrapper delegating to identify.identify() with this
    instance's configured importers.

    Args:
      what: The document (file path) to identify.
      *args: Extra positional arguments forwarded to identify.identify().
      **kwargs: Extra keyword arguments forwarded to identify.identify().

    Returns:
      Whatever identify.identify() returns for the document.
    """
    # BUG FIX: the original dropped the result and always returned
    # None, making the delegation useless to callers.
    return identify.identify(self.importers, what, *args, **kwargs)