def collect_entry(indexfh, identifier):
    """
    find position and size of entries corresponding to specified identifier
    from index
    """
    if args.duplicates:
        positions = get_positions(indexfh, identifier)
    elif args.zfound:
        positions = [get_position_last(indexfh, identifier)]
    else:
        positions = [get_position_first(indexfh, identifier)]

    if not positions or positions == [None]:  #empty array
        if args.verbose:
            eprint(" => WARNING: '{}' not found in index; skipping".format(
                identifier))
        return [], []

    entry_positions = list()
    entry_lengths = list()
    for position in positions:
        #1) extract position and entry size
        if args.encrypted:  #with iv
            posmatch = REESIVN.match(position)
        else:  #no iv
            posmatch = REES.match(position)
        entry_position, entry_length = posmatch.groups()

        #2) decode and append to lists
        entry_positions.append(b64_to_int(entry_position))
        entry_lengths.append(b64_to_int(entry_length))
    return entry_positions, entry_lengths

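# Illustrative sketch of the decoding step used above. demo_b64_to_int is a
# hypothetical stand-in, NOT ffdb's b64_to_int: it assumes positions/lengths
# are URL-safe base64 encodings of big-endian integers, which may differ in
# detail from the real index format.
from base64 import urlsafe_b64decode

def demo_b64_to_int(b64string):
    #pad to a multiple of 4 before decoding, then read as big-endian int
    padded = b64string + "=" * (-len(b64string) % 4)
    return int.from_bytes(urlsafe_b64decode(padded), 'big')

assert demo_b64_to_int("BI0") == 1165  #0x048d
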
def check_index_compatibility(indexfile1, indexfile2):
    """
    will check if old and new indexes are of same type
    """
    with open(indexfile1, 'r', 1) as indexfh:
        line1 = indexfh.readline()  #first line of index
        index_type1, cipher1, keysize1, has_checksum1 = check_index(line1)
    with open(indexfile2, 'r', 1) as indexfh:
        line2 = indexfh.readline()  #first line of index
        index_type2, cipher2, keysize2, has_checksum2 = check_index(line2)
    if index_type1 != index_type2 or cipher1 != cipher2 or keysize1 != keysize2:
        eprint(" => ERROR: The indexes you are merging '{}' and '{}'".format(
            indexfile1, indexfile2))
        eprint("    are of incompatible type! Cannot proceed!")
        sys.exit(22)
    if has_checksum1 != has_checksum2:
        eprint(" => ERROR: The indexes you are merging '{}' and '{}'".format(
            indexfile1, indexfile2))
        eprint("    are of the same type but only one contains checksums!")
        eprint("    Both should be either with or without checksums. Cannot proceed!")
        sys.exit(22)
    args.index_type = index_type1

def check_files():
    """
    check for ability to open input/output filenames
    """
    check_iofiles([args.input_filename], [])
    args.outff_filename = None
    if args.encrypt:
        args.outff_filename = "{}.{}".format(args.input_filename, "enc")
    elif args.compress:  #if compressed but not encrypted
        args.outff_filename = "{}.{}".format(args.input_filename, "xz")
    if args.outff_filename is not None:
        check_iofiles([], [args.outff_filename])

    if args.threads > 1:  #multithread
        os.mkdir(args.mt_subfiles_dir, mode=0o700)
        args.chunk_itempfiles = list()
        for chunknum in range(args.chunks_count):
            chunknumstr = str(chunknum).zfill(len(str(
                args.chunks_count)))  #e.g. 00, 01..
            chunk_tempfile = args.mt_subfiles_iprefix + "." + chunknumstr
            try:
                myoutputfh = open(chunk_tempfile, 'w')
            except PermissionError:
                eprint(" => ERROR: Cannot open temporary file '{}' for writing"
                       .format(chunk_tempfile))
                sys.exit(1)
            args.chunk_itempfiles.append(chunk_tempfile)  #store temp filenames
            myoutputfh.close()
        delete_files(args.chunk_itempfiles)
        #for chunknum in range(args.chunks_count): #DEBUG!
        #    eprint(" >> the temporary index file for chunk #{} will be '{}'".format(
        #        chunknum, args.chunk_itempfiles[chunknum]))

        if args.compress or args.encrypt:  #if outff needs to be generated
            args.chunk_otempfiles = list()
            for chunknum in range(args.chunks_count):
                chunknumstr = str(chunknum).zfill(len(str(
                    args.chunks_count)))  #e.g. 00, 01..
                chunk_tempfile = args.mt_subfiles_oprefix + "." + chunknumstr
                try:
                    myoutputfh = open(chunk_tempfile, 'w')
                except PermissionError:
                    eprint(" => ERROR: Cannot open temporary file '{}' for writing"
                           .format(chunk_tempfile))
                    sys.exit(1)
                args.chunk_otempfiles.append(chunk_tempfile)  #store temp filenames
                myoutputfh.close()
            delete_files(args.chunk_otempfiles)

def check_iofiles(read_filenames, write_filenames):
    """
    check for ability to open input/output filenames
    """
    if read_filenames is not None:
        for filename in read_filenames:
            try:
                inputfh = open(filename, 'r')
                inputfh.close()
            except FileNotFoundError:
                eprint(" => ERROR: Cannot open file '{}' for reading".format(
                    filename))
                sys.exit(2)

    if write_filenames is not None:
        for filename in write_filenames:
            if os.path.isfile(filename):
                eprint(" => ERROR: file '{}' exists".format(filename))
                eprint("    please remove it as we refuse to overwrite it!")
                sys.exit(1)
            try:
                myoutputfh = open(filename, 'w')
                myoutputfh.close()
            except PermissionError:
                eprint(" => ERROR: Cannot open file '{}' for writing".format(
                    filename))
                sys.exit(1)
            #eprint("deleting {}".format(filename)) #DEBUG
            delete_files([filename])

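# Note on the design: the write check above uses a create-probe-then-delete
# idiom rather than os.access(). Actually opening the file also validates the
# directory path and catches permission setups os.access() can misreport.
# Minimal standalone illustration (hypothetical filename):
import os

probe = "probe.tmp"
with open(probe, 'w'):  #raises PermissionError if location not writable
    pass
os.remove(probe)         #leave no trace, like delete_files() above
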
def print_stats(start_time):
    """
    if verbose print some final statistics on entries indexed
    """
    entries_sum = 0
    indexes_sum = 0
    skipped_sum = 0
    for chunknum in range(args.chunks_count):
        entries_sum += entries_count[chunknum]
        indexes_sum += indexes_count[chunknum]
        skipped_sum += skipped_count[chunknum]
    eprint(" '-- {} entries with {} indexes ~ {} entries skipped --'".format(
        entries_sum, indexes_sum, skipped_sum))
    eprint(" '-- Elapsed: {}, {} entries/sec --'".format(
        *elapsed_time(start_time, entries_sum)))

def test_format_indexes(self):
    """
    Check for correct parsing of indexes
    """
    cipher_type = "A"
    for try_index_type in ("-", ".", ":", "+"):
        new_indexes = ffdb.format_indexes(entry,
                                          try_index_type,
                                          cipher_type,
                                          checksums=False)
        assert len(new_indexes) == len(entry['ids'])
        firstline = new_indexes[0].rstrip()
        ffdb.eprint("index_type: '{}', index_line: '{}'".format(
            try_index_type, firstline))
        index_type, cipher_name, keysize, _ = ffdb.check_index(firstline)
        assert try_index_type == index_type
        if cipher_name is not None:
            assert cipher_type == ffdb.get_cipher_type(cipher_name)

def check_files():
    """
    do some checks on availability of resources
    """
    check_iofiles([args.flatfile], [])
    check_iofiles([], [args.output_filename, args.outindex_filename])
    if args.threads > 1:  #multithread
        os.mkdir(args.mt_subfiles_dir, mode=0o700)
        args.chunk_itempfiles = list()
        for chunknum in range(args.chunks_count):
            chunknumstr = str(chunknum).zfill(len(str(
                args.chunks_count)))  #e.g. 00, 01..
            chunk_tempfile = args.mt_subfiles_iprefix + "." + chunknumstr
            try:
                myoutputfh = open(chunk_tempfile, 'w')
            except PermissionError:
                eprint(" => ERROR: Cannot open temporary file '{}' for writing"
                       .format(chunk_tempfile))
                sys.exit(1)
            args.chunk_itempfiles.append(chunk_tempfile)  #store temp filenames
            myoutputfh.close()
        delete_files(args.chunk_itempfiles)

        args.chunk_otempfiles = list()
        for chunknum in range(args.chunks_count):
            chunknumstr = str(chunknum).zfill(len(str(
                args.chunks_count)))  #e.g. 00, 01..
            chunk_tempfile = args.mt_subfiles_oprefix + "." + chunknumstr
            try:
                myoutputfh = open(chunk_tempfile, 'w')
            except PermissionError:
                eprint(" => ERROR: Cannot open temporary file '{}' for writing"
                       .format(chunk_tempfile))
                sys.exit(1)
            args.chunk_otempfiles.append(chunk_tempfile)  #store temp filenames
            myoutputfh.close()
        delete_files(args.chunk_otempfiles)

def test_ciphers(self):
    global SALT
    SALT = b'5ed3a4284d6a9c1e4e4f6b4729b254be'
    passphrase = "The quick brown fox jumps over the lazy dog"
    iv = ffdb.generate_iv()

    ffdb.eprint("Testing aes128")
    keysize = 16
    cipher_name, key = ffdb.derive_key(passphrase, keysize)
    cipher_type = ffdb.get_cipher_type(cipher_name)
    assert cipher_name == "aes128"
    assert key == b'c\x05i\xa5\x81c`\x8e(\xa4\xd3CR\xc9\xb0\xf1'
    assert cipher_type == "A"

    ffdb.eprint("Testing aes192")
    keysize = 24
    cipher_name, key = ffdb.derive_key(passphrase, keysize)
    cipher_type = ffdb.get_cipher_type(cipher_name)
    assert cipher_name == "aes192"
    assert key == b'c\x05i\xa5\x81c`\x8e(\xa4\xd3CR\xc9\xb0\xf1\x86L\x7f=a\xd3\x8cw'
    assert cipher_type == "B"

    ffdb.eprint("Testing aes256")
    keysize = 32
    cipher_name, key = ffdb.derive_key(passphrase, keysize)
    cipher_type = ffdb.get_cipher_type(cipher_name)
    assert cipher_name == "aes256"
    assert key == b'c\x05i\xa5\x81c`\x8e(\xa4\xd3CR\xc9\xb0\xf1\x86L\x7f=a\xd3\x8cw\xadc:\x899\xfe^\xfe'
    assert cipher_type == "C"

    ffdb.eprint("Testing encryption/decryption")
    for keysize in (16, 24, 32):
        cipher_name, key = ffdb.derive_key(passphrase, keysize)
        cipher = ffdb.init_cipher(key, iv)
        encrypted_data = cipher.encrypt(data.encode('UTF-8'))
        compressed_encrypted_data = cipher.encrypt(
            ffdb.deflate(data.encode('UTF-8'), 9))
        cipher = ffdb.init_cipher(key, iv)  #re-init to decrypt from the start
        decrypted_data = cipher.decrypt(encrypted_data).decode('UTF-8')
        decrypted_uncompressed_data = ffdb.inflate(
            cipher.decrypt(compressed_encrypted_data)).decode('UTF-8')
        assert decrypted_data == data
        assert decrypted_uncompressed_data == data

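# Hedged sketch of the passphrase-to-key round trip exercised by the test
# above. This is NOT ffdb's actual derive_key/init_cipher: the KDF (PBKDF2),
# iteration count, and CTR nonce here are assumptions for illustration only.
# Requires pycryptodome.
import hashlib
from Crypto.Cipher import AES

DEMO_SALT = b'5ed3a4284d6a9c1e4e4f6b4729b254be'

def demo_derive_key(passphrase, keysize):
    #assumption: a PBKDF2-style KDF truncated to the requested key size
    return hashlib.pbkdf2_hmac('sha256', passphrase.encode('UTF-8'),
                               DEMO_SALT, 100000, dklen=keysize)

key = demo_derive_key("correct horse battery staple", 16)
nonce = b'\x00' * 8  #demo-only fixed nonce; a real IV must be unique
cipher = AES.new(key, AES.MODE_CTR, nonce=nonce)
ciphertext = cipher.encrypt(b"hello entry")
cipher = AES.new(key, AES.MODE_CTR, nonce=nonce)  #re-init, as the test does
assert cipher.decrypt(ciphertext) == b"hello entry"
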
def print_stats(start_time):
    """
    print some final statistics on entries deleted
    """
    indexes_sum = 0
    deleted_sum = 0
    reindexed_sum = 0
    for chunknum in range(args.chunks_count):
        indexes_sum += indexes_count[chunknum]
        reindexed_sum += reindexed_count[chunknum]
        deleted_sum += deleted_count[chunknum]
    if found_count == 1:
        eprint(" |-- Found and removed 1 entry.")
    elif found_count > 0:
        eprint(" |-- Found and removed {} entries.".format(found_count))
    if found_count < requested_count:
        if found_count == 0:
            eprint(" => WARNING: NONE of the {} requested identifiers found in index!"
                   .format(requested_count))
        else:
            eprint(" => WARNING: only {} of the {} requested identifiers found in index."
                   .format(found_count, requested_count))
    eprint(" |-- Deleted {} and reindexed {} indexes out of total {}.".format(
        deleted_sum, reindexed_sum, indexes_sum))
    _, reindexed_speed = elapsed_time(start_time, reindexed_sum)
    eprint(" '-- Elapsed: {}, {} deletions/sec; {} reindexing/sec --'".format(
        *elapsed_time(start_time, found_count), reindexed_speed))

def check_args():
    """
    parse arguments and check for error conditions
    """
    global args
    usagetxt = """{0} -f FLATFILE -i INDEXFILE -l LISTFILE [-o OUTPATH]
       [-f] : flatfile from which the entries should be removed
       [-i] : index of FLATFILE
       [-l] : file with list of identifiers for the entries that should be removed
    see {0} -h for tweaks and optional modes
\nexamples:
       {0} -f entries.dat -i entries.pos -l removeme.list
         (will create entries.dat.new and entries.pos.new)
       {0} -f entries.dat -i entries.pos -l removeme.list -o cleaned
         (will create cleaned/entries.dat.new and cleaned/entries.pos.new)
    """.format(PROGNAME)
    parser = argparse.ArgumentParser(
        description='Use a positional index to delete entries from a flatfile',
        usage=usagetxt)
    parser.add_argument(
        '-f', '--file', dest='flatfile',
        help="filename of flatfile from which entries should be deleted",
        required=True, type=str)
    parser.add_argument(
        '-i', '--index', dest='index_filename',
        help="filename of index file containing entry identifiers",
        required=True, type=str)
    parser.add_argument(
        '-l', '--list', dest='list_filename',
        help="a file containing a list of identifiers corresponding to entries \
to delete",
        required=True, type=str)
    parser.add_argument(
        '-o', '--outpath', dest='outpath',
        help="write new files to specified path rather than creating \
new files in the same location as the original ones",
        required=False, type=str)
    parser.add_argument('-v', '--verbose', dest='verbose',
                        action='store_true',
                        help="verbose operation", required=False)
    parser.add_argument(
        '-d', '--duplicates', dest='duplicates', action='store_true',
        help="specify INDEX_FILE could contain duplicate identifiers and request \
deletion of all of them (default is to delete a single entry)",
        required=False)
    parser.add_argument(
        '-z', '--zfound', dest='zfound', action='store_true',
        help="specify INDEX_FILE contains duplicate identifiers and request \
deletion of last entry appearing in the flatfile (default is the first)",
        required=False)
    parser.add_argument(
        '-t', '--threads', dest='threads',
        help="use specified number of multiple threads for parallel reindexing",
        required=False, type=int)
    parser.add_argument(
        '-b', '--blocksize', dest='index_blocksize',
        help="redefine blocksize used for parallel execution. By default \
it will be adjusted automatically to the number of threads",
        required=False, type=siprefix2num)
    args = parser.parse_args()

    randnum = str(randint(1000, 9999))

    args.progressbar = False
    if args.verbose:
        eprint(" .-- {} v{} -- by {} --.".format(PROGNAME, VERSION, AUTHOR))
        args.progressbar = True

    if args.zfound and args.duplicates:
        eprint(" => ERROR: No sense specifying both --zfound and --duplicates at the same time")
        sys.exit(22)

    if args.verbose:
        if args.zfound:
            eprint(" |-- [-z] option selected:")
            eprint(" |   if duplicates in index, the entry appearing last in ff will be deleted")
        elif args.duplicates:
            eprint(" |-- [-d] option selected:")
            eprint(" |-- if duplicates in index, all corresponding entries in ff will be deleted")
        else:
            eprint(" |-- if duplicates in index, the entry appearing first in ff will be deleted")
            eprint(" |   you can change this behaviour with [-z] or [-d]")

    if args.flatfile[-3:] == ".gz":
        eprint(" => ERROR: -f argument has extension .gz; wrong file??")
        sys.exit(22)
    if args.flatfile.find("://") != -1:  #this tool has no remote mode
        eprint(" => NOTICE: -f argument appears to be an URL; wrong file??")
        sys.exit(22)

    if args.outpath is None:
        args.outindex_filename = args.index_filename + ".new"
        args.output_filename = args.flatfile + ".new"
    else:
        if not os.access(args.outpath, os.W_OK):
            eprint(" => ERROR: specified outpath '{}' doesn't exist or is not writable!"
                   .format(args.outpath))
            sys.exit(1)
        args.outindex_filename = os.path.join(args.outpath,
                                              args.index_filename + ".new")
        args.output_filename = os.path.join(args.outpath,
                                            args.flatfile + ".new")
    if args.verbose:
        eprint(" |-- updated flatfile and index will be '{}' '{}'".format(
            args.output_filename, args.outindex_filename))

    #gather information from first line of index
    args.encrypted = False
    with open(args.index_filename, 'r', 1) as indexfh:
        args.index_type, _, _, _ = check_index(indexfh.readline())
    if args.index_type in (".", "+"):
        args.encrypted = True
        if args.verbose:
            eprint(" |-- index made for encrypted entries")
            eprint("  `=> Please ensure the -f filename points to the encrypted flatfile")
    if args.index_type in (":", "+"):
        if args.verbose:
            eprint(" |-- index made for compressed entries")
            eprint("  `=> Please ensure the -f filename points to the compressed flatfile")

    if args.index_blocksize is not None:
        if args.threads is None:
            eprint(" => ERROR: specifying blocksize makes sense only for -t execution")
            sys.exit(22)
        if args.verbose:
            eprint(" |-- blocksize set to {} bytes".format(
                args.index_blocksize))
    if args.threads is not None:  #multithread
        if args.threads < 2:
            eprint(" => ERROR: No sense specifying a number of threads lower than 2!")
            sys.exit(22)
        if args.index_blocksize is None:
            #if not specified, use 1/threads of the filesize,
            #with a minimum of MINBLOCKSIZE
            args.index_blocksize = max(
                calculate_blocksize(args.index_filename, args.threads),
                siprefix2num(MINBLOCKSIZE))
        args.mt_subfiles_dir = TEMPDIR + "/tmpREINDEX" + randnum + "/"
        args.mt_subfiles_iprefix = args.mt_subfiles_dir + "I"
        args.mt_subfiles_oprefix = args.mt_subfiles_dir + "O"
        args.list_filesize, args.chunks_count = calculate_chunknum(
            args.index_filename, args.index_blocksize)
        if args.verbose:
            eprint(" |-- Parallel work in {} chunks of maximum {} bytes (-b to change)"
                   .format(args.chunks_count, args.index_blocksize))
            eprint(" |-- using maximum {} parallel threads (-t); your OS reports {} cpus."
                   .format(args.threads, os.cpu_count()))
    else:  #if unspecified, set args.threads to 1
        args.threads = 1
        args.chunks_count = 1

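# Hedged sketch of an SI-prefix parser such as the siprefix2num used as the
# argparse type for -b above. demo_siprefix2num is hypothetical: the real
# function's accepted suffixes (and whether it uses 1000 or 1024 multipliers)
# may differ.
def demo_siprefix2num(value):
    multipliers = {'k': 1024, 'M': 1024**2, 'G': 1024**3}
    if value and value[-1] in multipliers:
        return int(value[:-1]) * multipliers[value[-1]]
    return int(value)

assert demo_siprefix2num("16M") == 16 * 1024**2
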
#2) delete entries from flatfile
delete_entries(args.flatfile, args.output_filename, mysorted_positions,
               myposition2size)

#3) fill dict with cumulative offsets, used to update index
myposition2offset = dict()
size_offset = 0
for myentry_position in mysorted_positions:
    size_offset += myposition2size[myentry_position]
    myposition2offset[myentry_position] = size_offset
#eprint("removed a total of {} bytes".format(size_offset)) #debug

if os.path.getsize(args.flatfile) - size_offset != os.path.getsize(
        args.output_filename):
    eprint(" => ERROR: problems with deletion, file size of resulting file is wrong")
    sys.exit(1)

#4) update the index shifting positions, optionally with multithread
if args.threads > 1:  #multithread
    if sys.version_info[1] > 7:  #from py3.8
        set_start_method('fork')  #spawn not implemented
    args.chunk_itempfiles, _ = split_file(args.index_filename,
                                          args.index_blocksize,
                                          args.mt_subfiles_iprefix)
    args.chunks_count = len(args.chunk_itempfiles)
    if args.verbose:
        eprint(" |-- parallel reindexing in chunks of maximum {} bytes (-b to change)"
               .format(args.index_blocksize))

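# Illustration of the cumulative-offset idea from step 3 above, as standalone
# code with made-up numbers (not ffdb code): each surviving entry's new
# position is its old position minus the bytes removed before it.
from bisect import bisect_right

deleted = {100: 40, 500: 60}       #position -> size of removed entry
sorted_positions = sorted(deleted)
offsets, total = [], 0
for pos in sorted_positions:
    total += deleted[pos]
    offsets.append(total)          #cumulative bytes removed up to pos

def shift(old_position):
    #subtract the total size of entries deleted before this position
    i = bisect_right(sorted_positions, old_position)
    return old_position - (offsets[i - 1] if i else 0)

assert shift(50) == 50             #before first deletion: unchanged
assert shift(300) == 260           #after first deletion: shifted by 40
assert shift(900) == 800           #after both deletions: shifted by 100
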
def check_args():
    """
    parse arguments and check for error conditions
    """
    global args, GZTOOL_EXE
    usagetxt = """{0} -f FLATFILE -i INDEXFILE -e ENTRIESFILE -n NEWINDEXFILE
       [-f] : flatfile into which the new entries should be added
       [-i] : index of FLATFILE
       [-e] : filename containing the new entries to be added
       [-n] : index of ENTRIESFILE
    see {0} -h for tweaks and optional modes
\nexamples:
       {0} -f db.dat -i db.pos -e new.dat -n new.pos
         (will update db.dat and db.pos)
       {0} -c -f db.dat -i db.pos -e new.dat -n new.pos
         (will create db.dat.new and db.pos.new)
       {0} -c -o export -f db.dat -i db.pos -e new.dat -n new.pos
         (will create export/db.dat.new and export/db.pos.new)
    """.format(PROGNAME)
    parser = argparse.ArgumentParser(
        description='Merge new pre-indexed entries into an existing flatfile',
        usage=usagetxt)
    parser.add_argument('-f', '--file', dest='ff_filename',
                        help="filename of flatfile to be processed",
                        required=True, type=str)
    parser.add_argument('-i', '--index', dest='index_filename',
                        help="filename of index file with entry identifiers",
                        required=True, type=str)
    parser.add_argument(
        '-e', '--entries', dest='newentries_filename',
        help="filename of new entries to be merged into flatfile",
        required=True, type=str)
    parser.add_argument('-n', '--newindex', dest='newindex_filename',
                        help="filename of index file with the new entries' \
identifiers",
                        required=True, type=str)
    parser.add_argument(
        '-c', '--create', dest='createmode', action='store_true',
        help="create new files (.new extension) rather than updating existing \
files (the default operation mode)",
        required=False)
    parser.add_argument(
        '-o', '--outpath', dest='outpath',
        help="optionally write new files to specified path rather than creating \
new files in the same location as the original ones",
        required=False, type=str)
    parser.add_argument('-v', '--verbose', dest='verbose',
                        action='store_true',
                        help="verbose operation", required=False)
    parser.add_argument(
        '-d', '--delete', dest='deleteafter', action='store_true',
        help="delete ENTRIESFILE and NEWINDEXFILE after merging is completed",
        required=False)
    parser.add_argument(
        '-g', '--gzip', dest='gzip', action='store_true',
        help="compress the final flatfile after merge, creating .gzi compressed \
index",
        required=False)
    parser.add_argument(
        '-s', '--small', dest='smallnew', action='store_true',
        help="use this mode if the new index is small (<30k entries): performance \
should be better",
        required=False)
    args = parser.parse_args()

    if args.verbose:
        eprint(" .-- {} v{} -- by {} --.".format(PROGNAME, VERSION, AUTHOR))

    args.ff_compressed = False
    if args.ff_filename[-3:] == ".gz":
        args.ff_compressed = True
        if not args.gzip:
            eprint(" => NOTICE: -f argument has extension .gz: assuming flatfile is compressed")
            eprint("    it will be uncompressed and then recompressed after merge")
            args.gzip = True

    args.newentries_compressed = False
    if args.newentries_filename[-3:] == ".gz":
        args.newentries_compressed = True
        eprint(" => NOTICE: -e argument has extension .gz: assuming newentriesfile compressed")

    if args.ff_filename.find("://") != -1:
        eprint(" => ERROR: {} cannot operate on remote flatfiles".format(
            PROGNAME))
        sys.exit(22)

    if args.smallnew:
        if args.verbose:
            eprint(" |-- using tweak for smaller new index")
    else:
        randnum = str(randint(1000, 9999))
        args.itemp_filename = TEMPDIR + "/tmpMERGE" + randnum

    if args.outpath is None:
        args.outindex_filename = args.index_filename + ".new"
    else:
        if not os.access(args.outpath, os.W_OK):
            eprint(" => ERROR: specified outpath '{}' doesn't exist or is not writable!"
                   .format(args.outpath))
            sys.exit(1)
        args.outindex_filename = os.path.join(args.outpath,
                                              args.index_filename + ".new")

if __name__ == '__main__':
    #parse and check arguments
    check_args()

    #check files (if they can be read/written)
    check_files()

    #check if old and new indexes are of same type:
    check_index_compatibility(args.index_filename, args.newindex_filename)

    #uncompress files if needed
    if args.ff_compressed:
        if args.verbose:
            eprint(" |-- uncompressing original flatfile.. this may take some time..")
        args.ff_filename = uncompress_file(args.ff_filename)
    if args.newentries_compressed:
        if args.verbose:
            eprint(" |-- uncompressing newentries file.. this may take some time..")
        args.newentries_filename = uncompress_file(args.newentries_filename)

    #calculate index offset
    pos_offset = os.path.getsize(args.ff_filename)

    #merge old and new identifiers' indexes
    if args.smallnew:
        tempfile = None

def test_deflate(self):
    bytestring = data.encode('UTF-8')
    deflated = ffdb.deflate(bytestring, 9)
    inflated = ffdb.inflate(deflated).decode('UTF-8')
    ffdb.eprint("testing inflate/deflate of data")
    assert inflated == data, "problems with compression or uncompression"

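# One plausible implementation of the deflate/inflate pair exercised above,
# using raw DEFLATE streams via zlib (wbits=-15, i.e. no zlib header). This
# is an assumption-based sketch, not necessarily identical to ffdb's code.
import zlib

def demo_deflate(data, compresslevel=9):
    compressor = zlib.compressobj(compresslevel, zlib.DEFLATED, -15)
    return compressor.compress(data) + compressor.flush()

def demo_inflate(data):
    return zlib.decompress(data, -15)

payload = b"some entry text" * 10
assert demo_inflate(demo_deflate(payload, 9)) == payload
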
def check_args():
    """
    parse arguments and check for error conditions
    """
    global args, patterns, joinedpatterns
    usagetxt = """{0} -f FLATFILE -i 'PATTERN' [-e ENDPATTERN] >INDEXFILE
       [-f] : flatfile to index
       [-i] : regex pattern for the identifiers; also [-j], see examples below
       [-e] : pattern for end of entry. defaults to "^-$"
    see '{0} -h' for tweaks and optional modes
\nnotes: If compression or encryption is requested, an output flatfile will be
    created, and the resulting index will refer to it.
    If the identifiers are a LOT and memory is an issue, you may wish to use
    the [-u] option and sort the resulting index after it has been generated.
\nexamples:
       {0} -i '^AC (.+?);' -f uniprot.dat -e '^//$' >up.pac
       {0} -i '^AC (.+?);' 'ID (.+?);' -f [...]
         (multiple patterns can be specified)
       {0} -i '^AC (.+?);' -j '^OX NCBI_(Tax)ID=(\\d+) ' -f [...]
         (complex patterns made of multiple parts can be specified with [-j];
          -i and -j patterns can be used together)
       {0} -a -j '^DR (.+?);( .+?);' -f [...]
       {0} -a -i '^AC (\\S+?); ?(\\S+)?;? ?(\\S+)?;?' -f [...]
         (use [-a] option to find all instances and capture groups of the
          provided patterns, not just the first one)
    """.format(PROGNAME)
    parser = argparse.ArgumentParser(
        description='Create a positional index for any flatfile, optionally \
compressing or encrypting its entries',
        usage=usagetxt)
    parser.add_argument('-f', '--file', dest='input_filename',
                        help="Filename of flatfile to be processed",
                        required=True)
    parser.add_argument('-i', '--id', dest='patterns',
                        help="regexp pattern for identifier(s) to index",
                        required=False, type=str, nargs='+')
    parser.add_argument('-j', '--joinedid', dest='joinedpatterns',
                        help="regexp pattern for joined identifier(s) to index",
                        required=False, type=str, nargs='+')
    parser.add_argument(
        '-e', '--endpattern', dest='terminator',
        help="regexp pattern to identify the end of each entry. If unspecified \
it defaults to '^-$'",
        required=False)
    parser.add_argument(
        '-a', '--allmatches', dest='allmatches', action='store_true',
        help="find all instances of the identifier pattern, not just the first \
one (the default behaviour)",
        required=False)
    parser.add_argument('-v', '--verbose', dest='verbose',
                        action='store_true',
                        help="verbose operation", required=False)
    parser.add_argument(
        '-t', '--threads', dest='threads',
        help="use specified number of threads for parallel indexing",
        required=False, type=int)
    parser.add_argument(
        '-b', '--blocksize', dest='input_blocksize',
        help="redefine blocksize used for parallel execution. By default \
it will be adjusted automatically to the number of threads",
        required=False, type=siprefix2num)
    parser.add_argument(
        '-o', '--offset', dest='pos_offset',
        help="optional offset (in bytes) to shift entry positions in index",
        required=False, type=int)
    parser.add_argument(
        '-k', '--keysize', dest='keysize',
        help="request entries to be encrypted and specify encryption strength: \
16=aes-128, 24=aes-192 or 32=aes-256. INPUT_FILENAME.enc will be created",
        required=False, type=int, choices=(16, 24, 32))
    parser.add_argument(
        '-p', '--passphrase', dest='passphrase',
        help="passphrase for encrypting the entries; if unspecified it will be \
requested interactively (safer)",
        required=False, type=str)
    parser.add_argument(
        '-c', '--compresslevel', dest='compresslevel',
        help="request entries to be compressed and specify a compress level. \
INPUT_FILENAME.xz will be created",
        required=False, type=int,
        choices=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9))
    parser.add_argument('-x', '--xsanity', dest='xsanity',
                        action='store_true',
                        help="compute entry checksums and add them to index",
                        required=False)
    parser.add_argument(
        '-u', '--unsorted', dest='unsorted', action='store_true',
        help="do not sort the index, leaving that task to a followup external \
command. Note that extraction requires a sorted index",
        required=False)
    parser.add_argument(
        '-n', '--nopos', dest='nopos', action='store_true',
        help="do not compute positions, just print matching identifiers",
        required=False)
    args = parser.parse_args()

    randnum = str(randint(1000, 9999))

    if args.patterns is None and args.joinedpatterns is None:
        eprint(" => ERROR: at least one of -i or -j needs to be given!")
        sys.exit(22)
    if args.patterns is None:
        args.patterns = []
    if args.joinedpatterns is None:
        args.joinedpatterns = []

    args.progressbar = False
    if args.verbose:
        eprint(" .-- {} v{} -- by {} --.".format(PROGNAME, VERSION, AUTHOR))
        args.progressbar = True

    if args.allmatches and args.verbose:
        eprint(" |-- All matches of the pattern will be stored as identifiers")

    if args.nopos:
        if args.pos_offset is not None or args.compresslevel is not None \
                or args.passphrase is not None or args.keysize is not None or \
                args.xsanity:
            eprint(" => ERROR: No sense specifying compression, encryption, sanity")
            eprint("    or pos_offset when using --nopos option")
            sys.exit(22)
        if args.verbose:
            eprint(" |-- No positional index will be created. Only printing identifiers found")

    if args.pos_offset is None:
        args.pos_offset = 0
    elif args.verbose:
        eprint(" |-- positions to be offset by: {}".format(args.pos_offset))

    if args.unsorted and args.verbose:
        eprint(" => NOTICE: index will be printed unsorted as requested.")
        eprint("    Please sort index before using it for extraction")

    patterns = set()
    for pattern in args.patterns:
        if args.verbose:
            eprint(" |-- adding identifier pattern '{}'".format(pattern))
        patterns.add(re.compile(pattern.encode('UTF-8'), re.MULTILINE))
    joinedpatterns = set()
    for pattern in args.joinedpatterns:
        if args.verbose:
            eprint(" |-- adding joined identifier pattern '{}'".format(pattern))
        joinedpatterns.add(re.compile(pattern.encode('UTF-8'), re.MULTILINE))

    if args.terminator is None:
        args.terminator = "^-$"  #default
    if args.verbose:
        eprint(" |-- entry terminator pattern set as '{}'".format(
            args.terminator))

    if args.xsanity and args.verbose:
        eprint(" |-- entry checksums will be computed and added to index")

    if args.keysize is not None and args.passphrase is None:
        eprint(" |-- keysize specified, please provide a passphrase:")
        args.passphrase = getpass.getpass(prompt=" |>> Passphrase: ")

    args.encrypt = False
    args.cipher_type = None
    args.key = None
    if args.passphrase is not None:
        args.encrypt = True
        if args.keysize is None:
            args.keysize = 16  #default
        args.cipher_name, args.key = derive_key(args.passphrase, args.keysize)
        args.cipher_type = get_cipher_type(args.cipher_name)
        if args.verbose:
            eprint(" |-- encrypted flatfile (with cipher {}) will be written to '{}.enc'"
                   .format(args.cipher_name, args.input_filename))
        #eprint(" |-- the passphrase is: {}".format(args.passphrase)) #DEBUG!
        #eprint(" |-- the encryption key is: {}".format(args.key)) #DEBUG!

    args.compress = False
    if args.compresslevel is not None:
        args.compress = True
        if args.encrypt:
            args.index_type = "+"  #both compressed and encrypted
            if args.verbose:
                eprint(" |-- entries will be compressed and encrypted to .enc file")
        else:
            args.index_type = ":"  #compressed but not encrypted
            if args.verbose:
                eprint(" |-- entries will be compressed to .xz file")
    else:  #not compressed
        if args.encrypt:
            args.index_type = "."  #encrypted but not compressed
            if args.verbose:
                eprint(" |-- entries will be encrypted to .enc file")
        else:
            args.index_type = "-"  #neither encrypted nor compressed, but entry sizes stored

    if args.input_blocksize is not None:
        if args.threads is None:
            eprint(" => ERROR: specifying blocksize makes sense only for -t execution")
            sys.exit(22)
        if args.verbose:
            eprint(" |-- blocksize set to {} bytes".format(
                args.input_blocksize))
    if args.threads is not None:  #multithread
        if args.threads < 2:
            eprint(" => ERROR: No sense specifying a number of threads lower than 2!")
            sys.exit(22)
        if args.input_blocksize is None:
            #if not specified, use 1/threads of the input filesize,
            #bounded between MINBLOCKSIZE and MAXBLOCKSIZE
            args.input_blocksize = max(
                siprefix2num(MINBLOCKSIZE),
                min(calculate_blocksize(args.input_filename, args.threads),
                    siprefix2num(MAXBLOCKSIZE)))
        args.input_filesize = os.path.getsize(args.input_filename)
        if args.input_blocksize > args.input_filesize // 2 * 3:
            eprint(" => NOTICE: blocksize too BIG compared to flatfile size, -t not applicable!")
            sys.exit(22)
        args.mt_subfiles_dir = TEMPDIR + "/tmpINDEX" + randnum + "/"
        args.mt_subfiles_fprefix = args.mt_subfiles_dir + "F"
        args.mt_subfiles_iprefix = args.mt_subfiles_dir + "I"
        args.mt_subfiles_oprefix = args.mt_subfiles_dir + "O"
        #find max number of chunks required (we'll adjust later on split)
        args.input_filesize, args.chunks_count = calculate_chunknum(
            args.input_filename, args.input_blocksize)
        if args.verbose:
            eprint(" |-- using maximum {} parallel threads (-t); your OS reports {} cpus."
                   .format(args.threads, os.cpu_count()))
    else:  #if unspecified, set args.threads to 1
        args.threads = 1
        args.chunks_count = 1

    if args.nopos:
        args.index_type = ""  #no indexes

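# Summary of the index_type flag grid assigned above (a restatement of the
# branches in check_args, shown here as a standalone lookup for clarity):
#   "-" plain   ":" compressed   "." encrypted   "+" compressed and encrypted
INDEX_TYPE_DEMO = {(False, False): "-", (True, False): ":",
                   (False, True): ".", (True, True): "+"}
assert INDEX_TYPE_DEMO[(True, True)] == "+"  #(compress, encrypt) -> "+"
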
if sys.version_info[1] > 7:  #from py3.8
    set_start_method('fork')  #spawn not implemented
args.chunk_ftemp_files = list()
#find out where to split the input file without breaking entries
args.chunk_ftemp_startpos, args.chunk_ftemp_filesizes = compute_split_positions(
    args.input_filename, args.input_blocksize, args.terminator)
args.chunks_count = len(args.chunk_ftemp_filesizes)
suffixlength = len(str(args.chunks_count))
for mychunknum in range(args.chunks_count):
    chunk_suffix = str(mychunknum).zfill(suffixlength)
    args.chunk_ftemp_files.append(args.mt_subfiles_dir + chunk_suffix)
if args.verbose:
    eprint(" |-- parallel work in chunks of maximum {} bytes (-b to change)"
           .format(args.input_blocksize))
    eprint(" |-- flatfile will be split into {} chunks".format(
        args.chunks_count))
entries_count = Array('i', args.chunks_count)
indexes_count = Array('i', args.chunks_count)
skipped_count = Array('i', args.chunks_count)
args.chunk_itempfiles = args.chunk_itempfiles[0:args.chunks_count]
if args.outff_filename is not None:
    args.chunk_otempfiles = args.chunk_otempfiles[0:args.chunks_count]

#init threads
pool = Pool(args.threads, initializer=init_thread,