def process_archive_retrieval_job(job, chunk_size, output_path, friendly_name=False):
    global chunk_count
    filepos_limit = job.archive_size_in_bytes - 1
    pad_length = len(str(job.archive_size_in_bytes // chunk_size)) + 1
    current_pos = 0
    job_archive_hash = job.archive_sha256_tree_hash
    chunk_count = 0
    failed_parts = {}
    running_treehash = TreeHash(algo=hashlib.sha256)
    try:
        while current_pos <= filepos_limit:
            chunk_count += 1
            end_pos = current_pos + (chunk_size - 1)
            if end_pos > filepos_limit:
                end_pos = filepos_limit
            if not process_archive_retrieval_range(job, output_path, current_pos, end_pos,
                                                   friendly_name, running_treehash):
                failed_parts["part_" + str(chunk_count).zfill(pad_length)] = [current_pos, end_pos]
            current_pos = end_pos + 1
        # Every part downloaded, but the assembled tree hash must still match the job's hash.
        if (not failed_parts) and (running_treehash.hexdigest() != job.archive_sha256_tree_hash):
            failed_parts["all"] = True
    except Exception:
        failed_parts["all"] = True
        raise
    return failed_parts
def sha256_on_file_range(filename, start, end, hash_chunk_size=DEFAULT_HASH_CHUNK_SIZE):
    sha256 = hashlib.sha256()
    treehash = TreeHash(algo=hashlib.sha256)
    with open(filename, "rb") as infile:
        infile.seek(start)
        if Debug:
            print("Running Hash: Start: " + str(start) + ", End: " + str(end))
        current_pos = start
        end += 1  # treat the (inclusive) end offset as exclusive for the read loop
        while current_pos < end:
            read_size = end - current_pos
            if read_size > hash_chunk_size:
                read_size = hash_chunk_size
            chunk = infile.read(read_size)
            sha256.update(chunk)
            treehash.update(chunk)
            current_pos += read_size
    if Debug:
        print("Running hash for this section (" + str(start) + " to " + str(end) + ") is " + sha256.hexdigest())
        print("Tree hash for this section (" + str(start) + " to " + str(end) + ") is " + treehash.hexdigest())
    return sha256.hexdigest()
def treehash_on_file_range(treehash, filename, start, end, hash_chunk_size=DEFAULT_HASH_CHUNK_SIZE):
    with open(filename, "rb") as infile:
        infile.seek(start)
        if Debug:
            print("Treehash: Start: " + str(start) + ", End: " + str(end))
        treehash_local = TreeHash(algo=hashlib.sha256)
        current_pos = start
        end += 1  # treat the (inclusive) end offset as exclusive for the read loop
        while current_pos < end:
            read_size = end - current_pos
            if read_size > hash_chunk_size:
                read_size = hash_chunk_size
            chunk = infile.read(read_size)
            if treehash:
                treehash.update(chunk)
            treehash_local.update(chunk)
            current_pos += read_size
    return treehash_local.hexdigest()
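A minimal driver sketch for the range helper above: feeding consecutive inclusive byte ranges of one file into a shared TreeHash accumulates the whole-file tree hash. It assumes it runs in the same module as the helper (so TreeHash, Debug, and DEFAULT_HASH_CHUNK_SIZE are in scope); the file name and range size are placeholders, not taken from the snippet.

# Hypothetical driver: accumulate a whole-file tree hash range by range.
running = TreeHash(algo=hashlib.sha256)
size = os.stat("backup.tar").st_size          # "backup.tar" is a placeholder path
step = 8 * 1024 * 1024                        # placeholder range size
for start in range(0, size, step):
    end = min(start + step - 1, size - 1)     # inclusive end offset, as the helper expects
    treehash_on_file_range(running, "backup.tar", start, end)
print("Whole-file tree hash: " + running.hexdigest())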
def test_update(self):
    tree_hash = TreeHash(TEST_CHUNK)
    tree_hash.update(TEST_DATA, TEST_INDEX)
    self.assertEqual(
        hashlib.sha256(TEST_DATA).hexdigest(),
        tree_hash.hexdigest()
    )
def main(): global Debug # Parse command line options parser = argparse.ArgumentParser() parser.add_argument("--file", help="File Name", required=True) parser.add_argument("--rangesize", help="Size of the range", required=True) parser.add_argument("--chunksize", help="Size of the chunks to use for checksumming.", default=DEFAULT_CHUNK_SIZE) parser.add_argument("--debug", help="Print Debug messages", action="store_true") args = parser.parse_args() Debug = args.debug treehash = TreeHash(algo=hashlib.sha256) statinfo = os.stat(args.file) end_pos = statinfo.st_size range = int(args.rangesize) chunksize = int(args.chunksize) current_pos = 0 while current_pos < end_pos: current_end_pos = current_pos + range if current_end_pos > end_pos: current_end_pos = end_pos running_treehash_on_file_range(treehash, args.file, current_pos, current_end_pos, chunksize) current_pos = current_end_pos print("TreeHash for " + args.file + " is " + treehash.hexdigest())
def test_digest(self):
    tree_hash = TreeHash(TEST_CHUNK, algo=hashlib.md5)
    tree_hash.update(TEST_DATA, TEST_INDEX)
    self.assertEqual(
        hashlib.md5(TEST_DATA).digest(),
        tree_hash.digest()
    )
def test_update(self):
    treehash = TreeHash()
    treehash.update(TEST_DATA)
    self.assertEqual(
        hashlib.sha256(TEST_DATA).hexdigest(),
        treehash.hexdigest()
    )
def hash_file(fname):
    treehash = TreeHash(algo=hashlib.sha256)
    with open(fname, 'rb') as f:
        while True:
            buf = f.read(1024 * 1024)
            if not buf:
                break
            treehash.update(buf)
    return treehash.hexdigest()
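A usage sketch for hash_file, assuming TreeHash and hashlib are in scope as above; the file path is a placeholder.

# Hypothetical usage: print the SHA-256 tree hash of a local file.
if __name__ == "__main__":
    print("Tree hash: " + hash_file("backup.tar"))  # "backup.tar" is a placeholder path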
def test_tree(self):
    hashlib_result = hashlib.sha256(
        hashlib.sha256(TEST_DATA).digest() +
        hashlib.sha256(TEST_DATA).digest()
    ).hexdigest()
    tree_hash = TreeHash(2, block_size=1)
    tree_hash.update(TEST_DATA, 0)
    tree_hash.update(TEST_DATA, 1)
    self.assertEqual(hashlib_result, tree_hash.hexdigest())
def process_archive_retrieval_job(job, chunk_size, output_path, friendly_name=False):
    global chunk_count
    filepos_limit = job.archive_size_in_bytes - 1
    current_pos = 0
    job_archive_hash = job.archive_sha256_tree_hash
    chunk_count = 0
    archive_file_name = output_path + "/" + job.id + ".archive"
    archive_file = open(archive_file_name, "wb")
    treehash = TreeHash(algo=hashlib.sha256)
    while current_pos <= filepos_limit:
        end_pos = current_pos + (chunk_size - 1)
        if end_pos > filepos_limit:
            end_pos = filepos_limit
        range_string = "bytes=" + str(current_pos) + "-" + str(end_pos)
        response = job.get_output(range=range_string)
        if Debug:
            print("process_archive_retrieval_job: job.get_output() response: " + str(response))
        if HTTP_SUCCESS_LOW <= response['status'] <= HTTP_SUCCESS_HIGH:
            chunk_count += 1
            if Debug:
                #print("Writing chunk " + str(chunk_count) + " " + range_string + " Checksum: " + response['checksum'] + " ContentRange: " + response['contentRange'] + " AcceptRanges: " + response['acceptRanges'] + " ContentType: " + response['contentType'] + " ArchiveDescription: " + response['archiveDescription'])
                print("Writing chunk " + str(chunk_count) + " " + range_string + " Checksum: " + response['checksum'])
            #archive_file.write(response['body'].read())
            chunk_bytes = response['body'].read()
            archive_file.write(chunk_bytes)
            # Flush so the read-back checksums below see the whole chunk on disk.
            archive_file.flush()
            if Debug:
                chunk_file = open(archive_file_name + ".chunk." + str(chunk_count), "wb")
                chunk_file.write(chunk_bytes)
                chunk_file.close()
            section_hash = sha256_on_file_range(archive_file_name, current_pos, end_pos)
            running_treehash_on_file_range(treehash, archive_file_name, current_pos, end_pos)
            if Debug:
                print("Local checksum of chunk " + str(chunk_count) + ": " + section_hash)
                print("Current running treehash is " + treehash.hexdigest())
            current_pos = end_pos + 1
        else:
            # current_pos is not advanced, so the same range is requested again.
            print("Response unsuccessful. Retrying")
    archive_file.close()
def main(): global Debug # Parse command line options parser = argparse.ArgumentParser() parser.add_argument("--file", help="file to use for hash tests", required=True) parser.add_argument("--outputpath", help="Path to store output", default=DEFAULT_OUTPUT_PATH) parser.add_argument("--chunksize", help="Size of the chunks to use for download. Only valid of the job is ArchiveRetrieval.", default=DEFAULT_CHUNK_SIZE) parser.add_argument("--debug", help="Print Debug messages", action="store_true") args = parser.parse_args() Debug = args.debug ProcessJob_old.set_debug(Debug) chunksize = int(args.chunksize) statinfo = os.stat(args.file) if not ProcessJob_old.is_power_of_2(chunksize): print("Chunksize " + str(chunksize) + " is not a power of two. The next closest power of two is " + str(ProcessJob_old.next_power_of_2(chunksize))) print("Exiting.") sys.exit(1) if chunksize > statinfo.st_size: chunksize = statinfo.st_size current_pos = 0 chunk_count = 0 treehash = TreeHash(algo=hashlib.sha256) while current_pos < statinfo.st_size: chunk_count += 1 end_pos = current_pos + chunksize - 1 if end_pos > statinfo.st_size: end_pos = statinfo.st_size if Debug: print("Processing chunk " + str(chunk_count) + " range " + str(current_pos) + " to " + str(end_pos) ) section_hash = ProcessJob_old.sha256_on_file_range(args.file, current_pos, end_pos) ProcessJob_old.running_treehash_on_file_range(treehash, args.file, current_pos, end_pos) print("Range: " + str(current_pos) + " to " + str(end_pos)) print("Local checksum of chunk " + str(chunk_count) + ": " + section_hash) print("Current running treehash is " + treehash.hexdigest()) current_pos = end_pos + 1
def run(self):
    client = self.get_boto_client()
    logging.info("Initiating job to upload")
    upload_job = client.initiate_multipart_upload(
        vaultName=self.vault,
        archiveDescription=self.file_name,
        partSize=str(UPLOAD_PART_SIZE))
    upload_id = upload_job['uploadId']
    treehash = TreeHash(block_size=1024**2)
    cur_file = open(self.file_location, 'rb')

    # Upload every full-size part except the last one.
    for i in range(self.numof_parts - 1):
        self.cur_part += 1
        self.update_task()
        data = cur_file.read(UPLOAD_PART_SIZE)
        treehash.update(data)
        cur_range = 'bytes %d-%d/*' % (i * UPLOAD_PART_SIZE,
                                       (i + 1) * UPLOAD_PART_SIZE - 1)
        client.upload_multipart_part(vaultName=self.vault,
                                     uploadId=upload_id,
                                     range=cur_range,
                                     body=data)

    # Upload the final (possibly shorter) part, starting where the full parts ended.
    self.cur_part += 1
    self.update_task()
    data = cur_file.read(UPLOAD_PART_SIZE)
    treehash.update(data)
    last_part_start = (self.numof_parts - 1) * UPLOAD_PART_SIZE
    cur_range = 'bytes %d-%d/*' % (last_part_start, self.file_size - 1)
    client.upload_multipart_part(vaultName=self.vault,
                                 uploadId=upload_id,
                                 range=cur_range,
                                 body=data)
    cur_file.close()

    hash_res = treehash.hexdigest()
    client.complete_multipart_upload(vaultName=self.vault,
                                     uploadId=upload_id,
                                     archiveSize=str(self.file_size),
                                     checksum=hash_res)
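The uploader above leaves per-part checksums to boto3. A minimal sketch of supplying one explicitly, following the pattern the Django-command and zip-upload snippets below use; the helper name and its arguments are hypothetical, and it assumes TreeHash and a boto3 Glacier client as in the snippet above.

# Hypothetical helper: compute a part's SHA-256 tree hash and send it with the part.
def upload_part_with_checksum(client, vault, upload_id, part, offset):
    part_hash = TreeHash(block_size=1024**2)  # same 1 MiB block size as the archive hash
    part_hash.update(part)
    return client.upload_multipart_part(
        vaultName=vault,
        uploadId=upload_id,
        range='bytes %d-%d/*' % (offset, offset + len(part) - 1),
        checksum=part_hash.hexdigest(),
        body=part)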
def test_tree(self):
    hashlib_result = hashlib.sha256(
        hashlib.sha256(TEST_DATA).digest() +
        hashlib.sha256(TEST_DATA).digest()).hexdigest()
    self.assertEqual(hashlib_result,
                     TreeHash(2 * TEST_DATA, block_size=1).hexdigest())
def test_update(self):
    treehash = TreeHash()
    treehash.update(TEST_DATA)
    self.assertEqual(
        hashlib.sha256(TEST_DATA).hexdigest(),
        treehash.hexdigest())
def handle(self, *args, **options):
    part_size = 8388608
    print('Contacting Amazon AWS...')
    glacier = boto3.client('glacier')
    multipart_upload = glacier.initiate_multipart_upload(
        vaultName=settings.GLACIER_VAULT_NAME,
        partSize=str(part_size))
    print('Connected to Glacier Vault "' + settings.GLACIER_VAULT_NAME + '"')
    upload_id = multipart_upload['uploadId']
    treehash_archive = TreeHash()
    db = influxdb.InfluxDBClient(settings.INFLUXDB_URI, 8086, 'root', 'root', 'seads')
    archive_size = 0
    for device in Device.objects.all():
        start = datetime.fromtimestamp(0)
        end = datetime.now() - timedelta(days=31 * device.data_retention_policy)
        start = (datetime.now() - start).total_seconds()
        start = 0
        end = int((datetime.now() - end).total_seconds())
        end = time.time() - end
        print('Trying ' + str(device) + '...')
        print('Data Retention Policy: ' + str(device.data_retention_policy) + ' Months')
        series = 'device.' + str(device.serial)
        try:
            query = ('select * from ' + series + ' where time > ' + str(start) +
                     's and time < ' + str(end) + 's')
            points = db.query(query)
        except Exception:
            print('No data found for ' + series + '. Skipping.')
            continue
        print("Uploading " + series + "...")
        print("Querying from " + str(datetime.fromtimestamp(int(start))) +
              " to " + str(datetime.fromtimestamp(int(end))))
        # store points in temporary file, break into 8MB parts
        with open('/tmp/temp_archive', 'wb') as f:
            f.write(json.dumps(points).encode('utf-8'))  # encode for the binary-mode file
        bytes_read = 0
        bytes_sent = 0
        with open('/tmp/temp_archive', 'rb') as f:
            treehash_part = TreeHash()
            part = f.read(part_size)
            treehash_part.update(part)
            bytes_read += len(part)
            while part:
                response = glacier.upload_multipart_part(
                    vaultName=settings.GLACIER_VAULT_NAME,
                    uploadId=upload_id,
                    range='bytes ' + str(bytes_sent) + '-' + str(bytes_read - 1) + '/*',
                    body=part,
                    checksum=treehash_part.hexdigest())
                bytes_sent += len(part)
                part = f.read(part_size)
                treehash_part.update(part)
                bytes_read += len(part)
                archive_size += 1
        print("Successfully uploaded " + str(bytes_sent) + " bytes to Glacier")
        print("Deleting points from database...")
        # drop from fanout series as well
        series = db.query('list series')[0]['points']
        rg = re.compile('device.' + str(device.serial))
        for s in series:
            if rg.search(s[1]):
                db.query('delete from ' + s[1] + ' where time > ' + str(start) +
                         's and time < ' + str(end) + 's')
        print("[DONE]")
    try:
        with open('/tmp/temp_archive', 'rb') as f:
            treehash_archive.update(f.read())
        response = glacier.complete_multipart_upload(
            vaultName=settings.GLACIER_VAULT_NAME,
            uploadId=upload_id,
            archiveSize=str(archive_size),
            checksum=treehash_archive.hexdigest())
        with open(settings.STATIC_PATH + 'archive_ids.log', 'a') as f:
            line = {
                'archiveId': response['archiveId'],
                'timeEnd': str(end)
            }
            f.write(json.dumps(line))
            f.write(';')
        os.remove('/tmp/temp_archive')
        print("Archival Successful")
    except Exception:
        print("No data to archive. Exiting.")
def upload(self, client):
    if self._inventory_entry.get_state() == FileState.IN_PROGRESS:
        self._upload_id = self._inventory_entry.get_upload_id()
    else:
        tmp_upload = client.initiate_multipart_upload(
            vaultName=self._vaultName,
            archiveDescription=self._inventory_entry.get_fileName(),
            partSize=str(self._partSize))
        self._upload_id = tmp_upload['uploadId']
        if self._partSize < self._fileSizeBytes:
            self._inventory_entry.set_state_from_upload(self, FileState.IN_PROGRESS)

    partBegin = self._partNumUploading * self._partSize
    data = b""
    with open(self._inventory_entry.get_filePath(), "rb") as f:
        if partBegin:
            # Re-read the already-uploaded prefix so the tree hash covers the whole file.
            data = f.read(partBegin)
        treehash = TreeHash(data=data, block_size=self._partSize)
        while partBegin < self._fileSizeBytes:
            partEnd = partBegin + self._partSize - 1
            if partEnd > self._fileSizeBytes - 1:
                partEnd = self._fileSizeBytes - 1
            part = f.read(self._partSize)
            treehash.update(part)
            if not self._startTime:
                self._startTime = time.time()
            self._upload_part(client, part, partBegin, partEnd)
            partBegin = partEnd + 1
            self._partNumUploading += 1
            if partEnd < self._fileSizeBytes:
                self._inventory_entry.set_state_from_upload(self, FileState.IN_PROGRESS)

    completed_treehash = treehash.hexdigest()
    response = client.complete_multipart_upload(
        vaultName=self._vaultName,
        uploadId=self._upload_id,
        archiveSize=str(self._fileSizeBytes),
        checksum=completed_treehash)
    self._endTime = time.time()
    cli.cli_progress(self._inventory_entry.get_fileName(),
                     self.formattedFileSize(),
                     self.formattedPartSize(),
                     self._startTime,
                     self._fileSizeBytes - 1,
                     self._fileSizeBytes - 1)

    # Sanity check that's probably unnecessary.
    if treehash.hexdigest() != response['checksum']:
        raise Exception('checksum mismatch')

    self._checksum = response['checksum']
    self._http_status = response['ResponseMetadata']['HTTPStatusCode']
    self._archive_id = response['archiveId']
    self._upload_location = response['location']
    # cli.pp(json.dumps(self, default=lambda o: o.__dict__))
    self._inventory_entry.set_state_from_upload(self, FileState.UPLOADED)
def test_constructor(self):
    self.assertEqual(
        hashlib.sha256(TEST_DATA).hexdigest(),
        TreeHash(TEST_DATA).hexdigest())
def test_empty(self):
    self.assertEqual(
        hashlib.sha256().hexdigest(),
        TreeHash(0).hexdigest()
    )
(fcnt, gbyte, awstmp) = (0, 1024 * 1024 * 1024, 'aws glacier {} --account-id ' + opt['Account'])
awstmp = awstmp + ' --vault-name {}'.format(opt['VaultName'])
print('## chunk size : {:6.2f} GB'.format(csize / gbyte))
for fn in sys.argv[1:]:
    if re.match('^--', fn):
        continue
    elif not os.path.exists(fn):
        if opt['Verbose']:
            print('## file {} cannot be accessed'.format(fn))
        continue
    fs = os.path.getsize(fn)
    if fs < opt['MinSize']:
        if opt['Verbose']:
            print('## file {} is shorter than lower limit {}.'.format(fn, opt['MinSize']))
        continue
    (ft, mt) = (open(fn, 'rb'), TreeHash())
    logevent('#{:03} target file: {}'.format(fcnt, fn))
    for x in range(0, fs, csize):
        mt.update(ft.read(csize))
    ft.close()
    arcdsc = datetime.now().strftime('v3/%Y-%m-%d@2%H@1%M@1%S/') + \
        fn.replace('@', '@0').replace(':', '@1').replace(' ', '@2').replace('/', '@3') + \
        '/' + mt.hexdigest() + '/0'
    while (1):
        (s, r) = cmd(awstmp.format('initiate-multipart-upload') +
                     ' --archive-description \"{}\" --part-size {}'.format(arcdsc, csize))
        if s != 0:
            errorexit(r)
        upid = json.loads(r)['uploadId']
for entry in os.scandir(picture_path):
    logger.debug('Found ' + entry.name + ' at ' + picture_path)
    potential_zip = entry.name + '.zip'
    if entry.is_dir() and re.match(r'\d+[_\-]\d+[_\-]\d+', entry.name) and potential_zip not in decoded_archived:
        logger.info(entry.name + ' matches the pattern \d+[_\-]\d+[_\-]\d+ and is not currently in Glacier as a zip file.')
        not_archived.append({'Zipfile': potential_zip, 'Path': (picture_path + '\\' + entry.name)})
        logger.info(potential_zip + ' added to list of archives that need to be created and uploaded.')

for n in not_archived:
    logger.info('Processing ' + str(n))
    with zipfile.ZipFile(n['Zipfile'], 'w', allowZip64=True) as zf:
        logger.info('Writing zipfile ' + n['Path'])
        writeZipfile(n['Path'], zf)
    with open(n['Zipfile'], 'rb') as file:
        expected_fullhash = TreeHash()
        expected_fullhash.update(file.read())
        expected_fullhash_value = expected_fullhash.hexdigest()
        logger.debug('File: ' + n['Zipfile'] + '. Expected full file hash is ' + str(expected_fullhash_value))
        file.seek(0)
        # Check for multipart upload in progress
        logger.debug('File: ' + n['Zipfile'] + '. Entering getExistingUploads, passing glacier object and target_vault_name: ' + target_vault_name)
        existing_uploads = getExistingUploads(glacier, target_vault_name)
        start_byte = 0
        file_size = 0
        chunksize = 8388608
        fullhash = TreeHash()
        description = '{"Path":"' + n['Zipfile'] + '", "ExpectedTreeHash":"' + expected_fullhash_value + '"}'
        logger.info('File: ' + n['Zipfile'] + '. Starting file upload process. File description: ' + description)
def test_constructor(self):
    self.assertEqual(
        TEST_CHUNK,
        len(TreeHash(TEST_CHUNK).hashes)
    )
    print('file name error. should be 21341234.0123456.meta. no leading \'dot\', \'/\', or full path')
    exit(1)
elif int(m[2]) < 2017 or 2020 < int(m[2]) or 12 < int(m[3]) or 31 < int(m[4]) or 24 < int(m[5]):
    print('file name error. should look like a time stamp')
    exit(1)

(fh, fs) = (open(fn, 'rb'), os.path.getsize(fn))
fb = fh.read(csh)
fe = DecMethod(fn).decrypt(fb).decode('utf-8')
# fe = DecMethod(fn).decrypt(fb)
# print(int(fe[0:10]))
js = json.loads(fe[10:10 + int(fe[0:10])])
if opt['Verbose']:
    print(js)
(mt, dec) = (TreeHash(), DecMethod(fn))
if opt['Verbose']:
    print('## because verbose option is specified, decoding results are output to %sa' % fn)
    fw = open(fn + 'a', 'wb')
for x in range(js['Size'], 0, -cs):
    fb = dec.decrypt(fh.read(cs))
    if cs <= x:
        mt.update(fb)
    else:
        mt.update(fb[0:x])
    if not opt['Verbose']:
        continue
    if cs <= x:
        fw.write(fb)
    else:
        fw.write(fb[0:x])
if opt['Verbose']:
    fw.close()
if mt.hexdigest() == js['LongCRC']:
    continue
def main(): # Parse command line options parser = argparse.ArgumentParser() parser.add_argument("--file", help="File to upload", required=True) parser.add_argument("--size", help="Chunk size in bytes", required=True) parser.add_argument("--account", help="Account ID", default="-") parser.add_argument("--vault", help="Vault Name", required=True) args = parser.parse_args() in_file = open(args.file, "rb") in_file_size = os.path.getsize(args.file) #in_file_sha256 = hashlib.sha256(open(args.file, "rb").read()).hexdigest() treehash = TreeHash(algo=hashlib.sha256) with open(args.file, "rb") as treehash_input: while True: data = treehash_input.read(hash_chunks) treehash.update(data) if len(data) < hash_chunks: break in_file_tree_sha256 = treehash.hexdigest() chunk_size = int(args.size) if chunk_size < min_chunk_size: print("Supplied chunk size (" + args.size + ") is less than minimum. Setting chunk size to " + str(min_chunk_size)) chunk_size = min_chunk_size glacier = boto3.resource('glacier') client = boto3.client('glacier') # multiupload_request = client.initiate_multipart_upload( vaultName=args.vault, archiveDescription=args.file, partSize=str(chunk_size) ) multipart_upload = glacier.MultipartUpload(args.account, args.vault, multiupload_request['uploadId']) print("MultiUpload ID: " + multiupload_request['uploadId']) print("Size: " + str(in_file_size)) #print("Hash: " + in_file_sha256) print("Tree Hash: " + in_file_tree_sha256) position = 0 chunk = in_file.read(chunk_size) while chunk: print("Length: " + str(len(chunk))) print("Current range: bytes " + str(position) + "-" + str(position + len(chunk) - 1) + "/*") response = multipart_upload.upload_part( range="bytes " + str(position) + "-" + str(position + len(chunk) - 1) + "/*", body=chunk ) print("Uploaded Checksum: " + response['checksum']) position += len(chunk) chunk = in_file.read(chunk_size) response = multipart_upload.complete( archiveSize=str(in_file_size), checksum=in_file_tree_sha256 ) print("Upload Complete.") print("Location: " + response['location']) print("Checksum: " + response['checksum']) print("Archive ID: " + response['archiveId']) print("Account ID: " + multipart_upload.account_id) print("Vault name: " + multipart_upload.vault_name) print("ID: " + multipart_upload.id)
def test_digest(self):
    self.assertEqual(
        hashlib.md5(TEST_DATA).digest(),
        TreeHash(TEST_DATA, algo=hashlib.md5).digest())
       'Verbose': '--verbose' in sys.argv}

for fn in sys.argv[1:]:
    if fn == '--verbose':
        continue
    (fs, fm) = (os.path.getsize(fn), datetime.now().strftime('%Y%m%d.%H%M%S.meta'))
    ft = ('' if 1024 * 1024 * 1024 * 32 < fs or opt['NoTmp'] else os.environ.get('TMP') + '/') + fm + '.tmp'
    # print('### tmp=',ft)
    # exit(1)
    print('# processing.. ', fn)
    if opt['Verbose']:
        print('## formatting to.. ', fm)
    (mt, enc) = (TreeHash(), EncMethod(fm))
    (fh, fw) = (open(fn, 'rb'), open(ft, 'wb'))
    fb = fh.read(css)
    mt.update(fb)
    if fs <= css:
        fb = (fb + b'@' * 16)[0:int((len(fb) + 15) / 16) * 16]
    fw.write(enc.encrypt(fb))
    meta = {
        'FileName': fn,
        'Size': fs,
        'CTime': os.path.getctime(fn),
        'DTime': fm,
        'ShortCRC': mt.hexdigest()
    }