def punctuate_character(char: str) -> int:
    """Create a punctuation based on special characters and upper letters"""
    score = 0
    # A character that changes under NFKD normalization is "special"
    # (it decomposes, e.g. an accented letter) and weighs more.
    if not unicodedata.is_normalized("NFKD", char):
        score += 30
    # Upper-case characters get a smaller bonus.
    if char.isupper():
        score += 10
    return score
def simple_truncate(unistr: str, maxsize: int) -> str:
    """Truncate *unistr* to at most *maxsize* UTF-8 bytes, never splitting a code point."""
    # from https://joernhees.de/blog/2010/12/14/how-to-restrict-the-length-of-a-unicode-string/
    import unicodedata

    # Compose first (NFC) so canonically-equivalent inputs truncate identically.
    if not unicodedata.is_normalized("NFC", unistr):
        unistr = unicodedata.normalize("NFC", unistr)
    # Cut at the byte limit; errors='ignore' silently drops a trailing
    # partial multi-byte sequence instead of raising.
    clipped = unistr.encode("utf-8")[:maxsize]
    return clipped.decode("utf-8", errors='ignore')
def get_normalization_form(string):
    """Return Dictionary of Normalization Forms."""
    # One True/False entry per form, in the same order as the original list.
    return {
        form: ud.is_normalized(form, string)
        for form in ('NFC', 'NFKC', 'NFD', 'NFKD')
    }
def run_normalization_tests(self, testdata):
    """Run the cases from the Unicode NormalizationTest.txt data.

    Each data line carries five semicolon-separated columns c1..c5; the
    Unicode standard prescribes how each column must map under the four
    normalization forms, and those invariants are asserted below.
    """
    part = None
    part1_data = {}

    # Local shorthands for the four normalization forms.
    def NFC(str):
        return unicodedata.normalize("NFC", str)

    def NFKC(str):
        return unicodedata.normalize("NFKC", str)

    def NFD(str):
        return unicodedata.normalize("NFD", str)

    def NFKD(str):
        return unicodedata.normalize("NFKD", str)

    for line in testdata:
        # Strip trailing comments; skip blank lines.
        if '#' in line:
            line = line.split('#')[0]
        line = line.strip()
        if not line:
            continue
        # Section headers look like "@PartN ..."; remember the current part
        # (Part1 is presumed to list every code point with its own data line
        # — TODO confirm against the data file format).
        if line.startswith("@Part"):
            part = line.split()[0]
            continue
        # First five fields are code-point sequences; self.unistr parses them.
        c1, c2, c3, c4, c5 = [self.unistr(x) for x in line.split(';')[:-1]]

        # Perform tests
        self.assertTrue(c2 == NFC(c1) == NFC(c2) == NFC(c3), line)
        self.assertTrue(c4 == NFC(c4) == NFC(c5), line)
        self.assertTrue(c3 == NFD(c1) == NFD(c2) == NFD(c3), line)
        self.assertTrue(c5 == NFD(c4) == NFD(c5), line)
        self.assertTrue(c4 == NFKC(c1) == NFKC(c2) == \
                        NFKC(c3) == NFKC(c4) == NFKC(c5), line)
        self.assertTrue(c5 == NFKD(c1) == NFKD(c2) == \
                        NFKD(c3) == NFKD(c4) == NFKD(c5), line)

        # The is_normalized() quick check must agree with the full
        # normalizations asserted above.
        self.assertTrue(unicodedata.is_normalized("NFC", c2))
        self.assertTrue(unicodedata.is_normalized("NFC", c4))
        self.assertTrue(unicodedata.is_normalized("NFD", c3))
        self.assertTrue(unicodedata.is_normalized("NFD", c5))
        self.assertTrue(unicodedata.is_normalized("NFKC", c4))
        self.assertTrue(unicodedata.is_normalized("NFKD", c5))

        # Record part 1 data
        if part == "@Part1":
            part1_data[c1] = 1

    # Perform tests for all other data
    # Every code point NOT covered by Part1 must be invariant under all forms.
    for c in range(sys.maxunicode + 1):
        X = chr(c)
        if X in part1_data:
            continue
        self.assertTrue(X == NFC(X) == NFD(X) == NFKC(X) == NFKD(X), c)
def run_normalization_tests(self, testdata):
    """Run the cases from the Unicode NormalizationTest.txt data.

    Relies on module-level helpers defined elsewhere in this file:
    unistr/RangeError for parsing, NFC/NFD/NFKC/NFKD for normalization,
    and is_normalized for the quick check — TODO confirm their location.
    """
    part = None
    part1_data = {}
    for line in testdata:
        # Strip trailing comments; skip blank lines.
        if '#' in line:
            line = line.split('#')[0]
        line = line.strip()
        if not line:
            continue
        # "@PartN" headers mark sections; remember which one we are in.
        if line.startswith("@Part"):
            part = line.split()[0]
            continue
        try:
            c1, c2, c3, c4, c5 = [unistr(x) for x in line.split(';')[:-1]]
        except RangeError:
            # Skip unsupported characters;
            # try at least adding c1 if we are in part1
            if part == "@Part1":
                try:
                    c1 = unistr(line.split(';')[0])
                except RangeError:
                    pass
                else:
                    part1_data[c1] = 1
            continue

        # Perform tests
        self.assertTrue(c2 == NFC(c1) == NFC(c2) == NFC(c3), line)
        self.assertTrue(c4 == NFC(c4) == NFC(c5), line)
        self.assertTrue(c3 == NFD(c1) == NFD(c2) == NFD(c3), line)
        self.assertTrue(c5 == NFD(c4) == NFD(c5), line)
        self.assertTrue(c4 == NFKC(c1) == NFKC(c2) == \
                        NFKC(c3) == NFKC(c4) == NFKC(c5), line)
        self.assertTrue(c5 == NFKD(c1) == NFKD(c2) == \
                        NFKD(c3) == NFKD(c4) == NFKD(c5), line)

        # The is_normalized() quick check must agree with the full
        # normalizations asserted above.
        self.assertTrue(is_normalized("NFC", c2))
        self.assertTrue(is_normalized("NFC", c4))
        self.assertTrue(is_normalized("NFD", c3))
        self.assertTrue(is_normalized("NFD", c5))
        self.assertTrue(is_normalized("NFKC", c4))
        self.assertTrue(is_normalized("NFKD", c5))

        # Record part 1 data
        if part == "@Part1":
            part1_data[c1] = 1

    # Perform tests for all other data
    # Every code point NOT covered by Part1 must be invariant under all forms.
    for c in range(sys.maxunicode + 1):
        X = chr(c)
        if X in part1_data:
            continue
        self.assertTrue(X == NFC(X) == NFD(X) == NFKC(X) == NFKD(X), c)
def run_normalization_tests(self, testdata):
    """Run the cases from the Unicode NormalizationTest.txt data.

    Near-duplicate of the sibling harness above; uses module-level
    unistr/RangeError/NFC.../is_normalized helpers defined elsewhere
    in this file — TODO confirm their location.
    """
    part = None
    part1_data = {}
    for line in testdata:
        # Strip trailing comments; skip blank lines.
        if '#' in line:
            line = line.split('#')[0]
        line = line.strip()
        if not line:
            continue
        # "@PartN" headers mark sections; remember which one we are in.
        if line.startswith("@Part"):
            part = line.split()[0]
            continue
        try:
            c1,c2,c3,c4,c5 = [unistr(x) for x in line.split(';')[:-1]]
        except RangeError:
            # Skip unsupported characters;
            # try at least adding c1 if we are in part1
            if part == "@Part1":
                try:
                    c1 = unistr(line.split(';')[0])
                except RangeError:
                    pass
                else:
                    part1_data[c1] = 1
            continue

        # Perform tests
        self.assertTrue(c2 == NFC(c1) == NFC(c2) == NFC(c3), line)
        self.assertTrue(c4 == NFC(c4) == NFC(c5), line)
        self.assertTrue(c3 == NFD(c1) == NFD(c2) == NFD(c3), line)
        self.assertTrue(c5 == NFD(c4) == NFD(c5), line)
        self.assertTrue(c4 == NFKC(c1) == NFKC(c2) == \
                        NFKC(c3) == NFKC(c4) == NFKC(c5), line)
        self.assertTrue(c5 == NFKD(c1) == NFKD(c2) == \
                        NFKD(c3) == NFKD(c4) == NFKD(c5), line)

        # The is_normalized() quick check must agree with the full
        # normalizations asserted above.
        self.assertTrue(is_normalized("NFC", c2))
        self.assertTrue(is_normalized("NFC", c4))
        self.assertTrue(is_normalized("NFD", c3))
        self.assertTrue(is_normalized("NFD", c5))
        self.assertTrue(is_normalized("NFKC", c4))
        self.assertTrue(is_normalized("NFKD", c5))

        # Record part 1 data
        if part == "@Part1":
            part1_data[c1] = 1

    # Perform tests for all other data
    # Every code point NOT covered by Part1 must be invariant under all forms.
    for c in range(sys.maxunicode+1):
        X = chr(c)
        if X in part1_data:
            continue
        self.assertTrue(X == NFC(X) == NFD(X) == NFKC(X) == NFKD(X), c)
def _explain_char(self, ch, further): try: name = unicodedata.name(ch) except ValueError: name = f'[U+{hex(ord(ch))[2:]}]' if not further: return name + f'({ch})' infos = { 'category': unicodedata.category(ch), 'direction': unicodedata.bidirectional(ch), 'east asian width': unicodedata.east_asian_width(ch) } decomposition = unicodedata.decomposition(ch) if decomposition: infos['decomposition'] = decomposition try: infos['digit value'] = unicodedata.digit(ch) except ValueError: pass try: infos['decimal value'] = unicodedata.decimal(ch) except ValueError: pass try: infos['numeric value'] = unicodedata.numeric(ch) except ValueError: pass comb = unicodedata.combining(ch) if comb != 0: infos['combining class'] = str(comb) mirrored = unicodedata.mirrored(ch) if mirrored: infos['mirrored'] = 'yes' if hasattr(unicodedata, 'is_normalized'): forms = [] for form in ('NFC', 'NFD', 'NFKC', 'NFKD'): if unicodedata.is_normalized(form, ch): forms.append(form) if forms: infos['normalized'] = f'yes: {", ".join(forms)}' else: infos['normalized'] = 'no' else: infos['normalized'] = 'unavailable' info = ', '.join([f'{k}: {v}' for k, v in infos.items()]) return f'{name}: {ch!r} ({info})'
def check_unicode_data(
    dataset: pycldf.Dataset,
    unicode_form: str = "NFC",
    logger: cli.logging.Logger = cli.logger,
) -> bool:
    """Check that every string cell in every table is *unicode_form* normalized.

    On the first offending value, reports it through log_or_raise and
    returns False; returns True when all strings pass.
    """
    for table in dataset.tables:
        for r, row in enumerate(table, 1):
            for value in row.values():
                if not isinstance(value, str):
                    continue
                if unicodedata.is_normalized(unicode_form, value):
                    continue
                log_or_raise(
                    message=
                    f"Value {value} of row {r} in table {table.url} is not in {unicode_form} normalized unicode",
                    log=logger,
                )
                return False
    return True
def sort_key(self):
    """Returns a sortable key representing this schema.

    This is used to decide which schema among many mergeable schemas
    should be the canonical one. The sort key is therefore generally the
    one that produces the glyph name that is the shortest, the most
    informative, and the most likely match for the original code point
    sequence.
    """
    cmap_string = chr(self.cmap) if self.cmap is not None else ''
    # Penalize sequences containing private-use (category 'Co') code points.
    uses_private_area = bool(self.cps) and any(
        unicodedata.category(chr(cp)) == 'Co' for cp in self.cps
    )
    return (
        uses_private_area,
        self.phase_index,
        self.cmap is None,
        not unicodedata.is_normalized('NFD', cmap_string),
        not self.cps,
        len(self.cps),
        self.original_shape != type(self.path),
        self.cps,
        len(self._calculate_name()),
    )
def lambda_handler(event, context):
    """S3 Batch Operations task handler.

    Small objects (<= MN_SIZE_FOR_BATCH_IN_BYTES) are copied to the
    destination bucket directly in this Lambda; larger objects are handed
    to an AWS Batch job. Returns the per-task invocation report that
    S3 Batch Operations expects.
    """
    logger.debug('## EVENT\r' + jsonpickle.encode(dict(**event)))
    jobId = event['job']['id']
    invocationId = event['invocationId']
    invocationSchemaVersion = event['invocationSchemaVersion']
    taskId = event['tasks'][0]['taskId']
    sourceKey = urllib.parse.unquote_plus(event['tasks'][0]['s3Key'])
    s3BucketArn = event['tasks'][0]['s3BucketArn']
    sourceBucket = s3BucketArn.split(':::')[-1]
    results = []
    # Prepare result code and string
    resultCode = None
    resultString = None
    minsizeforbatch = int(os.environ['MN_SIZE_FOR_BATCH_IN_BYTES'])
    # Copy object to new bucket with new key name
    try:
        logger.debug("preflight check start")
        # preflight checks _read_
        pre_flight_response = s3client.head_object(Bucket=sourceBucket,
                                                   Key=sourceKey)
        logger.debug('## PREFLIGHT_RESPONSE\r' +
                     jsonpickle.encode(dict(**pre_flight_response)))
        # BUG FIX: the original indexed pre_flight_response['pre_flight_response'],
        # a key that never exists (KeyError -> wrong PermanentFailure); the
        # delete-marker flag lives under 'DeleteMarker'.
        if 'DeleteMarker' in pre_flight_response:
            if pre_flight_response['DeleteMarker'] == True:
                raise Exception('Object ' + sourceKey + ' is deleted')
        size = pre_flight_response['ContentLength']
        destinationBucket = os.environ['DESTINATION_BUCKET_NAME']
        logger.debug("preflight check end")
        if (size > minsizeforbatch):
            unsupportedStorageClass = False
            # Storage class check
            if 'StorageClass' in pre_flight_response:
                if pre_flight_response['StorageClass'] in [
                        'GLACIER', 'DEEP_ARCHIVE'
                ]:
                    # check restore status:
                    if 'Restore' in pre_flight_response:
                        restore = pre_flight_response['Restore']
                        logger.debug(restore)
                        if 'ongoing-request="false"' not in restore:
                            logger.info('restore is in progress')
                            raise Exception(
                                'Object ' + sourceKey + ' is restoring from ' +
                                pre_flight_response['StorageClass'])
                    else:
                        unsupportedStorageClass = True
            if (unsupportedStorageClass):
                raise Exception('Object ' + sourceKey +
                                ' is in unsupported StorageClass ' +
                                pre_flight_response['StorageClass'])
            # NFC for unicodedata
            if not unicodedata.is_normalized('NFC', sourceKey):
                raise Exception('Object ' + sourceKey +
                                ' is not in Normalized Form C')
            if not is_can_submit_jobs():
                logger.info("too many jobs pending. returning slowdown")
                resultCode = 'TemporaryFailure'
                resultString = 'Retry request to batch due to too many pending jobs.'
            else:
                logger.debug("job submission start")
                # submit job
                response = batchclient.submit_job(
                    jobName="MediaSyncJob",
                    jobQueue=os.environ['JOB_QUEUE'],
                    jobDefinition=os.environ['JOB_DEFINITION'],
                    parameters={
                        'SourceS3Uri': 's3://' + sourceBucket + '/' + sourceKey,
                        'DestinationS3Uri':
                        's3://' + destinationBucket + '/' + sourceKey,
                        'Size': str(size)
                    },
                    tags={
                        'S3BatchJobId': jobId,
                        'SourceBucket': sourceBucket,
                        'DestinationBucket': destinationBucket,
                        'Key': sourceKey,
                        'Size': str(size)
                    })
                # BUG FIX: the original logged pre_flight_response here; log
                # the submit_job response instead.
                logger.debug('## BATCH_RESPONSE\r' +
                             jsonpickle.encode(dict(**response)))
                logger.debug("job submission complete")
                detail = 'https://console.aws.amazon.com/batch/v2/home?region=' + os.environ[
                    'AWS_REGION'] + '#jobs/detail/' + response['jobId']
                resultString = detail
                resultCode = 'Succeeded'
        else:  # <5GB
            copy_response = {}
            if (os.environ['IS_BUCKET_OWNER_FULL_CONTROL'] == 'FALSE'):
                copy_response = s3client.copy_object(Bucket=destinationBucket,
                                                     CopySource={
                                                         'Bucket': sourceBucket,
                                                         'Key': sourceKey
                                                     },
                                                     Key=sourceKey)
            else:
                copy_response = s3client.copy_object(
                    Bucket=destinationBucket,
                    CopySource={
                        'Bucket': sourceBucket,
                        'Key': sourceKey
                    },
                    ACL='bucket-owner-full-control',
                    Key=sourceKey)
            logger.debug('## COPY_RESPONSE\r' +
                         jsonpickle.encode(dict(**copy_response)))
            resultString = 'Lambda copy complete'
            resultCode = 'Succeeded'
    except ClientError as e:
        # If request timed out, mark as a temp failure
        # and S3 Batch Operations will make the task for retry. If
        # any other exceptions are received, mark as permanent failure.
        errorCode = e.response['Error']['Code']
        errorMessage = e.response['Error']['Message']
        logger.debug(errorMessage)
        if errorCode == 'TooManyRequestsException':
            resultCode = 'TemporaryFailure'
            resultString = 'Retry request to batch due to throttling.'
        elif errorCode == 'RequestTimeout':
            resultCode = 'TemporaryFailure'
            resultString = 'Retry request to Amazon S3 due to timeout.'
        elif (errorCode == '304'):
            resultCode = 'Succeeded'
            resultString = 'Not modified'
        elif (errorCode == 'SlowDown'):
            resultCode = 'TemporaryFailure'
            resultString = 'Retry request to s3 due to throttling.'
        else:
            resultCode = 'PermanentFailure'
            resultString = '{}: {}'.format(errorCode, errorMessage)
    except Exception as e:
        # Catch all exceptions to permanently fail the task
        resultCode = 'PermanentFailure'
        resultString = 'Exception: {}'.format(e)
    finally:
        results.append({
            'taskId': taskId,
            'resultCode': resultCode,
            'resultString': resultString
        })
        logger.info(resultCode + " # " + resultString)
    return {
        'invocationSchemaVersion': invocationSchemaVersion,
        'treatMissingKeysAs': 'PermanentFailure',
        'invocationId': invocationId,
        'results': results
    }
# Load names and RGB values from colorNames.csv into a list of dictionaries to # be used by the colorByName() function. The columns, in order, are name, hex colorDictList = [] with open("colorNames.csv", newline='') as csvColors: colorReader = csv.DictReader(csvColors) for row in colorReader: colorDictList.append(row) csvColors.close() # Normalize every name in the list of color dictionaries (replace all 'special' # characters with their ASCII counterparts, e.g., á becomes a). While this may # not be the ideal solution, it will work fine enough for these purposes. for color in colorDictList: # If the color name is already normalized, skip it if unicodedata.is_normalized("NFD", color["name"]): continue # Normalize the color name and remove the decomposed non-ASCII strings norm = unicodedata.normalize("NFD", color["name"]) encodedString = norm.encode("ascii", "ignore") color["name"] = encodedString.decode() # Searches through the list of colorDicts and returns the hex value for the # given color if the name is found. If not, a NameError is raised def findNamedColorHex(color): # These bounds are changed as the binary search occurs lowerBound = 0 upperBound = len(colorDictList) - 1 # Search until the bounds overlap while lowerBound <= upperBound: #for foo in range(0,20):
def is_nfd(x):
    """Report whether *x* is already in Unicode Normalization Form D."""
    form = 'NFD'
    return is_normalized(form, x)
def is_nfc(x):
    """Report whether *x* is already in Unicode Normalization Form C."""
    form = 'NFC'
    return is_normalized(form, x)
def _submit_job(sourceBucket, sourceKey):
    """Validate the source object, then submit a "Fixity" AWS Batch job for it.

    Raises when the object is deleted, stuck in an unsupported (archive)
    storage class, still restoring, or when its key is not NFC-normalized.
    Returns the submitted Batch job id.
    """
    logger.debug("preflight check start")
    # preflight checks _read_
    pre_flight_response = s3client.head_object(
        Bucket=sourceBucket,
        Key=sourceKey
    )
    logger.debug('## PREFLIGHT_RESPONSE\r' +
                 jsonpickle.encode(dict(**pre_flight_response)))
    # BUG FIX: the original indexed pre_flight_response['pre_flight_response'],
    # a key that never exists (KeyError); the flag lives under 'DeleteMarker'.
    if 'DeleteMarker' in pre_flight_response:
        if pre_flight_response['DeleteMarker'] == True:
            raise Exception('Object ' + sourceKey + ' is deleted')
    size = pre_flight_response['ContentLength']
    logger.debug("preflight check end")
    unsupportedStorageClass = False
    # Storage class check
    if 'StorageClass' in pre_flight_response:
        if pre_flight_response['StorageClass'] in ['GLACIER', 'DEEP_ARCHIVE']:
            # check restore status:
            if 'Restore' in pre_flight_response:
                restore = pre_flight_response['Restore']
                logger.debug(restore)
                if 'ongoing-request="false"' not in restore:
                    logger.info('restore is in progress')
                    raise Exception('Object ' + sourceKey +
                                    ' is restoring from ' +
                                    pre_flight_response['StorageClass'])
            else:
                unsupportedStorageClass = True
    if (unsupportedStorageClass):
        raise Exception('Object ' + sourceKey +
                        ' is in unsupported StorageClass ' +
                        pre_flight_response['StorageClass'])
    # NFC for unicodedata
    if not unicodedata.is_normalized('NFC', sourceKey):
        raise Exception('Object ' + sourceKey + ' is not in Normalized Form C' )
    # use bigger containers for 10GB+ (reuse the size already read above)
    jobDefinition = (os.environ['JOB_SIZE_SMALL']
                     if size < int(os.environ['JOB_SIZE_THRESHOLD'])
                     else os.environ['JOB_SIZE_LARGE'])
    logger.debug("job definition is " + jobDefinition)
    # FIX: the original logged "job submission start" twice.
    logger.debug("job submission start")
    # submit job
    response = batchclient.submit_job(
        jobName="Fixity",
        jobQueue=os.environ['JOB_QUEUE'],
        jobDefinition=jobDefinition,
        parameters={
            'Bucket': sourceBucket,
            'Key': sourceKey
        },
        propagateTags=True,
        tags={
            'Bucket': sourceBucket,
            'Key': sourceKey,
            'Size': str(size)
        }
    )
    logger.debug('## BATCH_RESPONSE\r' +
                 jsonpickle.encode(dict(**response)))
    logger.debug("job submission complete")
    return response['jobId']