def process_archive_retrieval_job(job, chunk_size, output_path, friendly_name=False):
    global chunk_count
    filepos_limit = job.archive_size_in_bytes - 1
    pad_length = len(str(job.archive_size_in_bytes // chunk_size)) + 1
    current_pos = 0
    job_archive_hash = job.archive_sha256_tree_hash
    chunk_count = 0
    failed_parts = {}
    running_treehash = TreeHash(algo=hashlib.sha256)
    try:
        while current_pos < filepos_limit:
            chunk_count += 1
            end_pos = current_pos + (chunk_size - 1)
            if end_pos > filepos_limit:
                end_pos = filepos_limit
            if not process_archive_retrieval_range(job, output_path, current_pos, end_pos,
                                                   friendly_name, running_treehash):
                failed_parts["part_" + str(chunk_count).zfill(pad_length)] = [current_pos, end_pos]
            current_pos = end_pos + 1
        # Every part downloaded: if the assembled tree hash still does not match the
        # job's expected hash, flag the whole archive as failed.
        if (not failed_parts) and (running_treehash.hexdigest() != job_archive_hash):
            failed_parts["all"] = True
    except Exception:
        failed_parts["all"] = True
        raise
    return failed_parts
def main():
    global Debug

    # Parse command line options
    parser = argparse.ArgumentParser()
    parser.add_argument("--file", help="File Name", required=True)
    parser.add_argument("--rangesize", help="Size of the range", required=True)
    parser.add_argument("--chunksize", help="Size of the chunks to use for checksumming.",
                        default=DEFAULT_CHUNK_SIZE)
    parser.add_argument("--debug", help="Print Debug messages", action="store_true")
    args = parser.parse_args()
    Debug = args.debug

    treehash = TreeHash(algo=hashlib.sha256)
    statinfo = os.stat(args.file)
    end_pos = statinfo.st_size
    range_size = int(args.rangesize)  # renamed from "range" to avoid shadowing the builtin
    chunksize = int(args.chunksize)
    current_pos = 0
    while current_pos < end_pos:
        current_end_pos = current_pos + range_size
        if current_end_pos > end_pos:
            current_end_pos = end_pos
        running_treehash_on_file_range(treehash, args.file, current_pos, current_end_pos, chunksize)
        current_pos = current_end_pos
    print("TreeHash for " + args.file + " is " + treehash.hexdigest())
def test_update(self):
    treehash = TreeHash()
    treehash.update(TEST_DATA)
    self.assertEqual(
        hashlib.sha256(TEST_DATA).hexdigest(),
        treehash.hexdigest()
    )
def test_update(self):
    tree_hash = TreeHash(TEST_CHUNK)
    tree_hash.update(TEST_DATA, TEST_INDEX)
    self.assertEqual(
        hashlib.sha256(TEST_DATA).hexdigest(),
        tree_hash.hexdigest()
    )
def test_md5(self):
    tree_hash = TreeHash(TEST_CHUNK, algo=hashlib.md5)
    tree_hash.update(TEST_DATA, TEST_INDEX)
    self.assertEqual(
        hashlib.md5(TEST_DATA).hexdigest(),
        tree_hash.hexdigest()
    )
def treehash_on_file_range(treehash, filename, start, end, hash_chunk_size=DEFAULT_HASH_CHUNK_SIZE):
    infile = open(filename, "rb")
    infile.seek(start)
    if Debug:
        print("Treehash: Start: " + str(start) + ", End: " + str(end))
    treehash_local = TreeHash(algo=hashlib.sha256)
    current_pos = start
    end += 1  # treat the inclusive end offset as exclusive from here on
    while current_pos < end:
        read_size = end - current_pos
        if read_size > hash_chunk_size:
            read_size = hash_chunk_size
        chunk = infile.read(read_size)
        if treehash:
            treehash.update(chunk)
        treehash_local.update(chunk)
        current_pos += read_size
    infile.close()
    return treehash_local.hexdigest()
def hash_file(fname):
    treehash = TreeHash(algo=hashlib.sha256)
    with open(fname, 'rb') as f:
        while True:
            buf = f.read(1024 * 1024)
            if not buf:
                break
            treehash.update(buf)
    return treehash.hexdigest()
def test_tree(self):
    hashlib_result = hashlib.sha256(
        hashlib.sha256(TEST_DATA).digest() +
        hashlib.sha256(TEST_DATA).digest()
    ).hexdigest()
    tree_hash = TreeHash(2, block_size=1)
    tree_hash.update(TEST_DATA, 0)
    tree_hash.update(TEST_DATA, 1)
    self.assertEqual(hashlib_result, tree_hash.hexdigest())
def process_archive_retrieval_job(job, chunk_size, output_path, friendly_name=False):
    global chunk_count
    filepos_limit = job.archive_size_in_bytes - 1
    current_pos = 0
    job_archive_hash = job.archive_sha256_tree_hash
    chunk_count = 0
    archive_file_name = output_path + "/" + job.id + ".archive"
    archive_file = open(archive_file_name, "wb")
    treehash = TreeHash(algo=hashlib.sha256)
    while current_pos < filepos_limit:
        end_pos = current_pos + (chunk_size - 1)
        if end_pos > filepos_limit:
            end_pos = filepos_limit
        range_string = "bytes=" + str(current_pos) + "-" + str(end_pos)
        response = job.get_output(range=range_string)
        if Debug:
            print("process_archive_retrieval_job: job.get_output() response: " + str(response))
        if HTTP_SUCCESS_LOW <= response['status'] <= HTTP_SUCCESS_HIGH:
            chunk_count += 1
            if Debug:
                print("Writing chunk " + str(chunk_count) + " " + range_string +
                      " Checksum: " + response['checksum'])
            chunk_bytes = response['body'].read()
            archive_file.write(chunk_bytes)
            # Flush so the on-disk checksum helpers below see the bytes just written.
            archive_file.flush()
            if Debug:
                chunk_file = open(archive_file_name + ".chunk." + str(chunk_count), "wb")
                chunk_file.write(chunk_bytes)
                chunk_file.close()
            section_hash = sha256_on_file_range(archive_file_name, current_pos, end_pos)
            running_treehash_on_file_range(treehash, archive_file_name, current_pos, end_pos)
            if Debug:
                print("Local checksum of chunk " + str(chunk_count) + ": " + section_hash)
                print("Current running treehash is " + treehash.hexdigest())
            current_pos = end_pos + 1
        else:
            print("Response unsuccessful. Retrying")
    archive_file.close()
def run(self):
    client = self.get_boto_client()
    logging.info("Initiating job to upload")
    upload_job = client.initiate_multipart_upload(
        vaultName=self.vault,
        archiveDescription=self.file_name,
        partSize=str(UPLOAD_PART_SIZE))
    upload_id = upload_job['uploadId']
    treehash = TreeHash(block_size=1024**2)
    cur_file = open(self.file_location, 'rb')
    # Upload every full-size part except the last one.
    for i in range(self.numof_parts - 1):
        self.cur_part += 1
        self.update_task()
        data = cur_file.read(UPLOAD_PART_SIZE)
        treehash.update(data)
        cur_range = 'bytes %d-%d/*' % (i * UPLOAD_PART_SIZE,
                                       (i + 1) * UPLOAD_PART_SIZE - 1)
        client.upload_multipart_part(vaultName=self.vault, uploadId=upload_id,
                                     range=cur_range, body=data)
    # Final (possibly short) part: it starts right after the last full-size part.
    self.cur_part += 1
    self.update_task()
    data = cur_file.read(UPLOAD_PART_SIZE)
    treehash.update(data)
    cur_range = 'bytes %d-%d/*' % ((self.numof_parts - 1) * UPLOAD_PART_SIZE,
                                   self.file_size - 1)
    client.upload_multipart_part(vaultName=self.vault, uploadId=upload_id,
                                 range=cur_range, body=data)
    cur_file.close()
    hash_res = treehash.hexdigest()
    client.complete_multipart_upload(vaultName=self.vault, uploadId=upload_id,
                                     archiveSize=str(self.file_size),
                                     checksum=hash_res)
def main():
    global Debug

    # Parse command line options
    parser = argparse.ArgumentParser()
    parser.add_argument("--file", help="file to use for hash tests", required=True)
    parser.add_argument("--outputpath", help="Path to store output", default=DEFAULT_OUTPUT_PATH)
    parser.add_argument("--chunksize",
                        help="Size of the chunks to use for download. Only valid if the job is ArchiveRetrieval.",
                        default=DEFAULT_CHUNK_SIZE)
    parser.add_argument("--debug", help="Print Debug messages", action="store_true")
    args = parser.parse_args()
    Debug = args.debug
    ProcessJob_old.set_debug(Debug)
    chunksize = int(args.chunksize)
    statinfo = os.stat(args.file)

    if not ProcessJob_old.is_power_of_2(chunksize):
        print("Chunksize " + str(chunksize) + " is not a power of two. "
              "The next closest power of two is " + str(ProcessJob_old.next_power_of_2(chunksize)))
        print("Exiting.")
        sys.exit(1)
    if chunksize > statinfo.st_size:
        chunksize = statinfo.st_size

    current_pos = 0
    chunk_count = 0
    treehash = TreeHash(algo=hashlib.sha256)
    while current_pos < statinfo.st_size:
        chunk_count += 1
        end_pos = current_pos + chunksize - 1
        if end_pos > statinfo.st_size:
            end_pos = statinfo.st_size
        if Debug:
            print("Processing chunk " + str(chunk_count) + " range " + str(current_pos) + " to " + str(end_pos))
        section_hash = ProcessJob_old.sha256_on_file_range(args.file, current_pos, end_pos)
        ProcessJob_old.running_treehash_on_file_range(treehash, args.file, current_pos, end_pos)
        print("Range: " + str(current_pos) + " to " + str(end_pos))
        print("Local checksum of chunk " + str(chunk_count) + ": " + section_hash)
        print("Current running treehash is " + treehash.hexdigest())
        current_pos = end_pos + 1
def sha256_on_file_range(filename, start, end, hash_chunk_size=DEFAULT_HASH_CHUNK_SIZE):
    sha256 = hashlib.sha256()
    infile = open(filename, "rb")
    treehash = TreeHash(algo=hashlib.sha256)
    infile.seek(start)
    if Debug:
        print("Running Hash: Start: " + str(start) + ", End: " + str(end))
    current_pos = start
    end += 1  # treat the inclusive end offset as exclusive from here on
    while current_pos < end:
        read_size = end - current_pos
        if read_size > hash_chunk_size:
            read_size = hash_chunk_size
        chunk = infile.read(read_size)
        sha256.update(chunk)
        treehash.update(chunk)
        current_pos += read_size
    infile.close()
    if Debug:
        print("Running hash for this section (" + str(start) + " to " + str(end) + ") is " + sha256.hexdigest())
        print("Tree hash for this section (" + str(start) + " to " + str(end) + ") is " + treehash.hexdigest())
    return sha256.hexdigest()
def upload(self, client):
    if self._inventory_entry.get_state() == FileState.IN_PROGRESS:
        self._upload_id = self._inventory_entry.get_upload_id()
    else:
        tmp_upload = client.initiate_multipart_upload(
            vaultName=self._vaultName,
            archiveDescription=self._inventory_entry.get_fileName(),
            partSize=str(self._partSize))
        self._upload_id = tmp_upload['uploadId']
        if self._partSize < self._fileSizeBytes:
            self._inventory_entry.set_state_from_upload(
                self, FileState.IN_PROGRESS)
    partBegin = self._partNumUploading * self._partSize
    data = b""
    with open(self._inventory_entry.get_filePath(), "rb") as f:
        # When resuming, re-read the already-uploaded prefix so the tree hash
        # covers the whole file.
        if partBegin:
            data = f.read(partBegin)
        treehash = TreeHash(data=data, block_size=self._partSize)
        while partBegin < self._fileSizeBytes:
            partEnd = partBegin + self._partSize - 1
            if partEnd >= self._fileSizeBytes:
                partEnd = self._fileSizeBytes - 1  # clamp to the last valid byte index
            part = f.read(self._partSize)
            treehash.update(part)
            if not self._startTime:
                self._startTime = time.time()
            self._upload_part(client, part, partBegin, partEnd)
            partBegin = partEnd + 1
            self._partNumUploading += 1
            if partEnd < self._fileSizeBytes:
                self._inventory_entry.set_state_from_upload(
                    self, FileState.IN_PROGRESS)
    completed_treehash = treehash.hexdigest()
    response = client.complete_multipart_upload(
        vaultName=self._vaultName,
        uploadId=self._upload_id,
        archiveSize=str(self._fileSizeBytes),
        checksum=completed_treehash)
    self._endTime = time.time()
    cli.cli_progress(self._inventory_entry.get_fileName(),
                     self.formattedFileSize(), self.formattedPartSize(),
                     self._startTime, self._fileSizeBytes - 1, self._fileSizeBytes - 1)
    # Sanity check that's probably unnecessary.
    if treehash.hexdigest() != response['checksum']:
        raise Exception('checksum mismatch')
    self._checksum = response['checksum']
    self._http_status = response['ResponseMetadata']['HTTPStatusCode']
    self._archive_id = response['archiveId']
    self._upload_location = response['location']
    # cli.pp(json.dumps(self, default=lambda o: o.__dict__))
    self._inventory_entry.set_state_from_upload(self, FileState.UPLOADED)
def handle(self, *args, **options):
    part_size = 8388608
    print 'Contacting Amazon AWS...'
    glacier = boto3.client('glacier')
    multipart_upload = glacier.initiate_multipart_upload(
        vaultName=settings.GLACIER_VAULT_NAME,
        partSize=str(part_size))
    print 'Connected to Glacier Vault "' + settings.GLACIER_VAULT_NAME + '"'
    upload_id = multipart_upload['uploadId']
    treehash_archive = TreeHash()
    db = influxdb.InfluxDBClient(settings.INFLUXDB_URI, 8086, 'root', 'root', 'seads')
    archive_size = 0
    for device in Device.objects.all():
        start = datetime.fromtimestamp(0)
        end = datetime.now() - timedelta(days=31 * device.data_retention_policy)
        start = (datetime.now() - start).total_seconds()
        start = 0
        end = int((datetime.now() - end).total_seconds())
        end = time.time() - end
        print 'Trying ' + str(device) + '...'
        print 'Data Retention Policy: ' + str(device.data_retention_policy) + ' Months'
        series = 'device.' + str(device.serial)
        try:
            query = ('select * from ' + series + ' where time > ' + str(start) +
                     's and time < ' + str(end) + 's')
            points = db.query(query)
        except:
            print 'No data found for ' + series + '. Skipping.'
            continue
        print "Uploading " + series + "..."
        print "Querying from " + str(datetime.fromtimestamp(int(start))) + " to " + str(datetime.fromtimestamp(int(end)))
        # store points in temporary file, break into 8MB parts
        with open('/tmp/temp_archive', 'wb') as f:
            f.write(json.dumps(points))
        bytes_read = 0
        bytes_sent = 0
        with open('/tmp/temp_archive', 'rb') as f:
            treehash_part = TreeHash()
            part = f.read(part_size)
            treehash_part.update(part)
            bytes_read += len(part)
            while part:
                response = glacier.upload_multipart_part(
                    vaultName=settings.GLACIER_VAULT_NAME,
                    uploadId=upload_id,
                    range='bytes ' + str(bytes_sent) + '-' + str(bytes_read - 1) + '/*',
                    body=part,
                    checksum=treehash_part.hexdigest())
                bytes_sent += len(part)
                part = f.read(part_size)
                treehash_part.update(part)
                bytes_read += len(part)
        archive_size += 1
        print "Successfully uploaded " + str(bytes_sent) + " bytes to Glacier"
        print "Deleting points from database..."
        # drop from fanout series as well
        series = db.query('list series')[0]['points']
        rg = re.compile('device.' + str(device.serial))
        for s in series:
            if rg.search(s[1]):
                db.query('delete from ' + s[1] + ' where time > ' + str(start) +
                         's and time < ' + str(end) + 's')
        print "[DONE]"
    try:
        with open('/tmp/temp_archive', 'rb') as f:
            treehash_archive.update(f.read())
        response = glacier.complete_multipart_upload(
            vaultName=settings.GLACIER_VAULT_NAME,
            uploadId=upload_id,
            archiveSize=str(archive_size),
            checksum=treehash_archive.hexdigest())
        with open(settings.STATIC_PATH + 'archive_ids.log', 'a') as f:
            line = {'archiveId': response['archiveId'], 'timeEnd': str(end)}
            f.write(json.dumps(line))
            f.write(';')
        os.remove('/tmp/temp_archive')
        print "Archival Successful"
    except:
        print "No data to archive. Exiting."
potential_zip = entry.name + '.zip'
if entry.is_dir() and re.match(r'\d+[_\-]\d+[_\-]\d+', entry.name) and potential_zip not in decoded_archived:
    logger.info(entry.name + ' matches the pattern \\d+[_\\-]\\d+[_\\-]\\d+ and is not currently in Glacier as a zip file.')
    not_archived.append({'Zipfile': potential_zip, 'Path': (picture_path + '\\' + entry.name)})
    logger.info(potential_zip + ' added to list of archives that need to be created and uploaded.')

for n in not_archived:
    logger.info('Processing ' + str(n))
    with zipfile.ZipFile(n['Zipfile'], 'w', allowZip64=True) as zf:
        logger.info('Writing zipfile ' + n['Path'])
        writeZipfile(n['Path'], zf)
    with open(n['Zipfile'], 'rb') as file:
        expected_fullhash = TreeHash()
        expected_fullhash.update(file.read())
        expected_fullhash_value = expected_fullhash.hexdigest()
        logger.debug('File: ' + n['Zipfile'] + '. Expected full file hash is ' + str(expected_fullhash_value))
        file.seek(0)

        # Check for multipart upload in progress
        logger.debug('File: ' + n['Zipfile'] + '. Entering getExistingUploads, passing glacier object and target_vault_name: ' + target_vault_name)
        existing_uploads = getExistingUploads(glacier, target_vault_name)

        start_byte = 0
        file_size = 0
        chunksize = 8388608
        fullhash = TreeHash()
        # Close the quote after the zip file name so the description is valid JSON.
        description = '{"Path":"' + n['Zipfile'] + '", "ExpectedTreeHash":"' + expected_fullhash_value + '"}'
        logger.info('File: ' + n['Zipfile'] + '. Starting file upload process. File description: ' + description)
        logger.info('File: ' + n['Zipfile'] + '. Chunksize: ' + str(chunksize))
# Excerpt: this first block runs inside the enclosing loop of the original
# script that obtains the multipart upload id.
if opt['Verbose']:
    print("## upload id " + upid)
if upid[0] != '-':
    break

(rp, scnt, ft, awsids) = (0, 0, open(fn, 'rb'), awstmp + ' --upload-id {}'.format(upid))
for x in range(0, fs, csize):
    rs = ft.read(csize)
    (fl, fp) = (len(rs), opt['TempFile'].format(fcnt, scnt))
    with open(fp, 'wb') as fw:
        fw.write(rs)
    mp = TreeHash()
    mp.update(rs)
    (s, r) = cmd(awsids.format('upload-multipart-part') + ' --body \'{}\''.format(fp) +
                 ' --range \"bytes {}-{}/*\" --checksum \"{}\"'.format(rp, rp + fl - 1, mp.hexdigest()))
    if s != 0:
        errorexit(r)
    if opt['Verbose']:
        print('## done part {:2}, {:6.2f} GB ({:12} b)'.format(scnt, fl / gbyte, fl), flush=True)
    elif scnt == 0:
        print('## done part {:2}'.format(scnt), end='', flush=True)
    else:
        print(' {:2}'.format(scnt), end='', flush=True)
    (rp, scnt) = (rp + fl, scnt + 1)
    os.remove(fp)
ft.close()
print('')
(s, r) = cmd(
def main():
    # Parse command line options
    parser = argparse.ArgumentParser()
    parser.add_argument("--file", help="File to upload", required=True)
    parser.add_argument("--size", help="Chunk size in bytes", required=True)
    parser.add_argument("--account", help="Account ID", default="-")
    parser.add_argument("--vault", help="Vault Name", required=True)
    args = parser.parse_args()

    in_file = open(args.file, "rb")
    in_file_size = os.path.getsize(args.file)

    # Compute the SHA-256 tree hash of the whole file before uploading.
    treehash = TreeHash(algo=hashlib.sha256)
    with open(args.file, "rb") as treehash_input:
        while True:
            data = treehash_input.read(hash_chunks)
            treehash.update(data)
            if len(data) < hash_chunks:
                break
    in_file_tree_sha256 = treehash.hexdigest()

    chunk_size = int(args.size)
    if chunk_size < min_chunk_size:
        print("Supplied chunk size (" + args.size + ") is less than minimum. "
              "Setting chunk size to " + str(min_chunk_size))
        chunk_size = min_chunk_size

    glacier = boto3.resource('glacier')
    client = boto3.client('glacier')

    multiupload_request = client.initiate_multipart_upload(
        vaultName=args.vault,
        archiveDescription=args.file,
        partSize=str(chunk_size)
    )
    multipart_upload = glacier.MultipartUpload(args.account, args.vault,
                                               multiupload_request['uploadId'])

    print("MultiUpload ID: " + multiupload_request['uploadId'])
    print("Size: " + str(in_file_size))
    print("Tree Hash: " + in_file_tree_sha256)

    position = 0
    chunk = in_file.read(chunk_size)
    while chunk:
        print("Length: " + str(len(chunk)))
        print("Current range: bytes " + str(position) + "-" + str(position + len(chunk) - 1) + "/*")
        response = multipart_upload.upload_part(
            range="bytes " + str(position) + "-" + str(position + len(chunk) - 1) + "/*",
            body=chunk
        )
        print("Uploaded Checksum: " + response['checksum'])
        position += len(chunk)
        chunk = in_file.read(chunk_size)

    response = multipart_upload.complete(
        archiveSize=str(in_file_size),
        checksum=in_file_tree_sha256
    )
    print("Upload Complete.")
    print("Location: " + response['location'])
    print("Checksum: " + response['checksum'])
    print("Archive ID: " + response['archiveId'])
    print("Account ID: " + multipart_upload.account_id)
    print("Vault name: " + multipart_upload.vault_name)
    print("ID: " + multipart_upload.id)