Example No. 1
import unicodedata


def punctuate_character(char: str) -> int:
    """Score a character: decomposable (special) characters and uppercase letters add points."""
    punctuation = 0
    # is_normalized requires Python 3.8+; characters that change under NFKD
    # (accents, ligatures, compatibility forms) count as "special"
    if not unicodedata.is_normalized("NFKD", char):
        punctuation += 30
    if char.isupper():
        punctuation += 10
    return punctuation
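
A quick, hypothetical way to exercise it is to score a whole word by summing the per-character scores. Assuming Python 3.8+ (unicodedata.is_normalized was added in 3.8), "Ação" should score 70: 10 for the uppercase "A" plus 30 each for the decomposable "ç" and "ã".

word = "Ação"  # hypothetical input
print(sum(punctuate_character(c) for c in word))  # expected: 70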
Example No. 2
def simple_truncate(unistr: str, maxsize: int) -> str:
    # from https://joernhees.de/blog/2010/12/14/how-to-restrict-the-length-of-a-unicode-string/
    import unicodedata
    # Normalize to NFC first so that canonically-equivalent strings truncate the same way
    if not unicodedata.is_normalized("NFC", unistr):
        unistr = unicodedata.normalize("NFC", unistr)
    # Cut at the byte level; errors='ignore' drops a trailing, incomplete
    # multi-byte sequence instead of raising an error
    return str(unistr.encode("utf-8")[:maxsize],
               encoding="utf-8",
               errors='ignore')
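
A small sanity check, assuming Python 3.8+: "ação" occupies 6 bytes in UTF-8, so cutting at 2 bytes would split the two-byte "ç"; errors='ignore' silently drops the dangling byte instead of failing.

print(simple_truncate("ação", 2))  # -> 'a'
print(simple_truncate("ação", 4))  # -> 'aç'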
Example No. 3
import unicodedata as ud


def get_normalization_form(string):
    """Return a dict mapping each normalization form to whether the string is in it."""

    forms = ['NFC', 'NFKC', 'NFD', 'NFKD']
    normalization_form = dict()

    for form in forms:
        # is_normalized already returns a bool, so no if/else branch is needed
        normalization_form[form] = ud.is_normalized(form, string)
    return normalization_form
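
For example (Python 3.8+), a precomposed "café" is already in both composed forms but in neither decomposed form, so the call should report:

print(get_normalization_form("café"))
# {'NFC': True, 'NFKC': True, 'NFD': False, 'NFKD': False}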
Example No. 4
    def run_normalization_tests(self, testdata):
        part = None
        part1_data = {}

        def NFC(s):
            return unicodedata.normalize("NFC", s)

        def NFKC(s):
            return unicodedata.normalize("NFKC", s)

        def NFD(s):
            return unicodedata.normalize("NFD", s)

        def NFKD(s):
            return unicodedata.normalize("NFKD", s)

        for line in testdata:
            if '#' in line:
                line = line.split('#')[0]
            line = line.strip()
            if not line:
                continue
            if line.startswith("@Part"):
                part = line.split()[0]
                continue
            c1, c2, c3, c4, c5 = [self.unistr(x) for x in line.split(';')[:-1]]

            # Perform tests
            self.assertTrue(c2 == NFC(c1) == NFC(c2) == NFC(c3), line)
            self.assertTrue(c4 == NFC(c4) == NFC(c5), line)
            self.assertTrue(c3 == NFD(c1) == NFD(c2) == NFD(c3), line)
            self.assertTrue(c5 == NFD(c4) == NFD(c5), line)
            self.assertTrue(c4 == NFKC(c1) == NFKC(c2) ==
                            NFKC(c3) == NFKC(c4) == NFKC(c5),
                            line)
            self.assertTrue(c5 == NFKD(c1) == NFKD(c2) ==
                            NFKD(c3) == NFKD(c4) == NFKD(c5),
                            line)

            self.assertTrue(unicodedata.is_normalized("NFC", c2))
            self.assertTrue(unicodedata.is_normalized("NFC", c4))

            self.assertTrue(unicodedata.is_normalized("NFD", c3))
            self.assertTrue(unicodedata.is_normalized("NFD", c5))

            self.assertTrue(unicodedata.is_normalized("NFKC", c4))
            self.assertTrue(unicodedata.is_normalized("NFKD", c5))

            # Record part 1 data
            if part == "@Part1":
                part1_data[c1] = 1

        # Perform tests for all other data
        for c in range(sys.maxunicode + 1):
            X = chr(c)
            if X in part1_data:
                continue
            self.assertTrue(X == NFC(X) == NFD(X) == NFKC(X) == NFKD(X), c)
Example No. 5
    def run_normalization_tests(self, testdata):
        part = None
        part1_data = {}

        for line in testdata:
            if '#' in line:
                line = line.split('#')[0]
            line = line.strip()
            if not line:
                continue
            if line.startswith("@Part"):
                part = line.split()[0]
                continue
            try:
                c1, c2, c3, c4, c5 = [unistr(x) for x in line.split(';')[:-1]]
            except RangeError:
                # Skip unsupported characters;
                # try at least adding c1 if we are in part1
                if part == "@Part1":
                    try:
                        c1 = unistr(line.split(';')[0])
                    except RangeError:
                        pass
                    else:
                        part1_data[c1] = 1
                continue

            # Perform tests
            self.assertTrue(c2 == NFC(c1) == NFC(c2) == NFC(c3), line)
            self.assertTrue(c4 == NFC(c4) == NFC(c5), line)
            self.assertTrue(c3 == NFD(c1) == NFD(c2) == NFD(c3), line)
            self.assertTrue(c5 == NFD(c4) == NFD(c5), line)
            self.assertTrue(c4 == NFKC(c1) == NFKC(c2) ==
                            NFKC(c3) == NFKC(c4) == NFKC(c5),
                            line)
            self.assertTrue(c5 == NFKD(c1) == NFKD(c2) ==
                            NFKD(c3) == NFKD(c4) == NFKD(c5),
                            line)

            self.assertTrue(is_normalized("NFC", c2))
            self.assertTrue(is_normalized("NFC", c4))

            self.assertTrue(is_normalized("NFD", c3))
            self.assertTrue(is_normalized("NFD", c5))

            self.assertTrue(is_normalized("NFKC", c4))
            self.assertTrue(is_normalized("NFKD", c5))

            # Record part 1 data
            if part == "@Part1":
                part1_data[c1] = 1

        # Perform tests for all other data
        for c in range(sys.maxunicode + 1):
            X = chr(c)
            if X in part1_data:
                continue
            self.assertTrue(X == NFC(X) == NFD(X) == NFKC(X) == NFKD(X), c)
Example No. 6
    def _explain_char(self, ch, further):
        try:
            name = unicodedata.name(ch)
        except ValueError:
            name = f'[U+{hex(ord(ch))[2:]}]'
        if not further:
            return name + f'({ch})'
        infos = {
            'category': unicodedata.category(ch),
            'direction': unicodedata.bidirectional(ch),
            'east asian width': unicodedata.east_asian_width(ch)
        }

        decomposition = unicodedata.decomposition(ch)
        if decomposition:
            infos['decomposition'] = decomposition

        try:
            infos['digit value'] = unicodedata.digit(ch)
        except ValueError:
            pass
        try:
            infos['decimal value'] = unicodedata.decimal(ch)
        except ValueError:
            pass
        try:
            infos['numeric value'] = unicodedata.numeric(ch)
        except ValueError:
            pass
        comb = unicodedata.combining(ch)
        if comb != 0:
            infos['combining class'] = str(comb)

        mirrored = unicodedata.mirrored(ch)
        if mirrored:
            infos['mirrored'] = 'yes'
        if hasattr(unicodedata, 'is_normalized'):
            forms = []
            for form in ('NFC', 'NFD', 'NFKC', 'NFKD'):
                if unicodedata.is_normalized(form, ch):
                    forms.append(form)
            if forms:
                infos['normalized'] = f'yes: {", ".join(forms)}'
            else:
                infos['normalized'] = 'no'
        else:
            infos['normalized'] = 'unavailable'

        info = ', '.join([f'{k}: {v}' for k, v in infos.items()])
        return f'{name}: {ch!r} ({info})'
Example No. 7
def check_unicode_data(
    dataset: pycldf.Dataset,
    unicode_form: str = "NFC",
    logger: cli.logging.Logger = cli.logger,
) -> bool:
    for table in dataset.tables:
        for r, row in enumerate(table, 1):
            for value in row.values():
                if isinstance(value, str):
                    if not unicodedata.is_normalized(unicode_form, value):
                        log_or_raise(
                            message=f"Value {value} of row {r} in table "
                            f"{table.url} is not in {unicode_form}-normalized Unicode",
                            log=logger,
                        )
                        return False
    return True
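
A minimal stand-alone sketch of the same NFC check over plain dict rows, without pycldf; the data and names here are hypothetical, and it requires Python 3.8+ for is_normalized:

import unicodedata

rows = [{"ID": "1", "Form": "cafe\u0301"}]  # decomposed é, i.e. not NFC
for r, row in enumerate(rows, 1):
    for value in row.values():
        if isinstance(value, str) and not unicodedata.is_normalized("NFC", value):
            print(f"Value {value!r} of row {r} is not NFC-normalized")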
Example No. 8
    def sort_key(self):
        """Returns a sortable key representing this schema.

        This is used to decide which schema among many mergeable schemas
        should be the canonical one. The sort key is therefore generally
        the one that produces the glyph name that is the shortest, the
        most informative, and the most likely match for the original
        code point sequence.
        """
        cmap_string = '' if self.cmap is None else chr(self.cmap)
        return (
            bool(self.cps)
            and any(unicodedata.category(chr(cp)) == 'Co' for cp in self.cps),
            self.phase_index,
            self.cmap is None,
            not unicodedata.is_normalized('NFD', cmap_string),
            not self.cps,
            len(self.cps),
            self.original_shape != type(self.path),
            self.cps,
            len(self._calculate_name()),
        )
Example No. 9
def lambda_handler(event, context):

    logger.debug('## EVENT\r' + jsonpickle.encode(dict(**event)))

    jobId = event['job']['id']
    invocationId = event['invocationId']
    invocationSchemaVersion = event['invocationSchemaVersion']

    taskId = event['tasks'][0]['taskId']
    sourceKey = urllib.parse.unquote_plus(event['tasks'][0]['s3Key'])
    s3BucketArn = event['tasks'][0]['s3BucketArn']
    sourceBucket = s3BucketArn.split(':::')[-1]

    results = []
    # Prepare result code and string
    resultCode = None
    resultString = None

    minsizeforbatch = int(os.environ['MN_SIZE_FOR_BATCH_IN_BYTES'])

    # Copy object to new bucket with new key name
    try:
        logger.debug("preflight check start")

        #preflight checks _read_
        pre_flight_response = s3client.head_object(Bucket=sourceBucket,
                                                   Key=sourceKey)

        logger.debug('## PREFLIGHT_RESPONSE\r' +
                     jsonpickle.encode(dict(**pre_flight_response)))

        # A delete marker means the newest version of the object is a delete
        if pre_flight_response.get('DeleteMarker'):
            raise Exception('Object ' + sourceKey + ' is deleted')

        size = pre_flight_response['ContentLength']
        destinationBucket = os.environ['DESTINATION_BUCKET_NAME']

        logger.debug("preflight check end")

        if (size > minsizeforbatch):

            unsupportedStorageClass = False

            #Storage class check
            if 'StorageClass' in pre_flight_response:
                if pre_flight_response['StorageClass'] in [
                        'GLACIER', 'DEEP_ARCHIVE'
                ]:
                    #check restore status:
                    if 'Restore' in pre_flight_response:
                        restore = pre_flight_response['Restore']
                        logger.debug(restore)
                        if 'ongoing-request="false"' not in restore:
                            logger.info('restore is in progress')
                            raise Exception(
                                'Object ' + sourceKey + ' is restoring from ' +
                                pre_flight_response['StorageClass'])
                    else:
                        unsupportedStorageClass = True

            if (unsupportedStorageClass):
                raise Exception('Object ' + sourceKey +
                                ' is in unsupported StorageClass ' +
                                pre_flight_response['StorageClass'])

            # The object key must be NFC-normalized Unicode
            if not unicodedata.is_normalized('NFC', sourceKey):
                raise Exception('Object ' + sourceKey +
                                ' is not in Normalization Form C')

            if not is_can_submit_jobs():

                logger.info("too many jobs pending. returning slowdown")
                resultCode = 'TemporaryFailure'
                resultString = 'Retry request to batch due to too many pending jobs.'

            else:

                logger.debug("job submission start")

                #submit job
                response = batchclient.submit_job(
                    jobName="MediaSyncJob",
                    jobQueue=os.environ['JOB_QUEUE'],
                    jobDefinition=os.environ['JOB_DEFINITION'],
                    parameters={
                        'SourceS3Uri':
                        's3://' + sourceBucket + '/' + sourceKey,
                        'DestinationS3Uri':
                        's3://' + destinationBucket + '/' + sourceKey,
                        'Size': str(size)
                    },
                    tags={
                        'S3BatchJobId': jobId,
                        'SourceBucket': sourceBucket,
                        'DestinationBucket': destinationBucket,
                        'Key': sourceKey,
                        'Size': str(size)
                    })

                logger.debug('## BATCH_RESPONSE\r' +
                             jsonpickle.encode(dict(**response)))
                logger.debug("job submission complete")

                detail = 'https://console.aws.amazon.com/batch/v2/home?region=' + os.environ[
                    'AWS_REGION'] + '#jobs/detail/' + response['jobId']
                resultString = detail
                resultCode = 'Succeeded'

        else:
            # <5GB
            copy_response = {}

            if (os.environ['IS_BUCKET_OWNER_FULL_CONTROL'] == 'FALSE'):
                copy_response = s3client.copy_object(Bucket=destinationBucket,
                                                     CopySource={
                                                         'Bucket':
                                                         sourceBucket,
                                                         'Key': sourceKey
                                                     },
                                                     Key=sourceKey)
            else:
                copy_response = s3client.copy_object(
                    Bucket=destinationBucket,
                    CopySource={
                        'Bucket': sourceBucket,
                        'Key': sourceKey
                    },
                    ACL='bucket-owner-full-control',
                    Key=sourceKey)

            logger.debug('## COPY_RESPONSE\r' +
                         jsonpickle.encode(dict(**copy_response)))
            resultString = 'Lambda copy complete'
            resultCode = 'Succeeded'

    except ClientError as e:
        # If request timed out, mark as a temp failure
        # and S3 Batch Operations will make the task for retry. If
        # any other exceptions are received, mark as permanent failure.
        errorCode = e.response['Error']['Code']
        errorMessage = e.response['Error']['Message']

        logger.debug(errorMessage)

        if errorCode == 'TooManyRequestsException':
            resultCode = 'TemporaryFailure'
            resultString = 'Retry request to batch due to throttling.'
        elif errorCode == 'RequestTimeout':
            resultCode = 'TemporaryFailure'
            resultString = 'Retry request to Amazon S3 due to timeout.'
        elif (errorCode == '304'):
            resultCode = 'Succeeded'
            resultString = 'Not modified'
        elif (errorCode == 'SlowDown'):
            resultCode = 'TemporaryFailure'
            resultString = 'Retry request to s3 due to throttling.'
        else:
            resultCode = 'PermanentFailure'
            resultString = '{}: {}'.format(errorCode, errorMessage)

    except Exception as e:
        # Catch all exceptions to permanently fail the task
        resultCode = 'PermanentFailure'
        resultString = 'Exception: {}'.format(e)

    finally:
        results.append({
            'taskId': taskId,
            'resultCode': resultCode,
            'resultString': resultString
        })
        logger.info(resultCode + " # " + resultString)

    return {
        'invocationSchemaVersion': invocationSchemaVersion,
        'treatMissingKeysAs': 'PermanentFailure',
        'invocationId': invocationId,
        'results': results
    }
Example No. 10
import csv
import unicodedata

# Load names and RGB values from colorNames.csv into a list of dictionaries to
# be used by the colorByName() function. The columns, in order, are name, hex.
colorDictList = []
with open("colorNames.csv", newline='') as csvColors:
    colorReader = csv.DictReader(csvColors)
    for row in colorReader:
        colorDictList.append(row)

# Normalize every name in the list of color dictionaries (replace all 'special'
# characters with their ASCII counterparts, e.g., á becomes a). While this may
# not be the ideal solution, it will work fine enough for these purposes.
for color in colorDictList:
    # If the color name is already normalized, skip it
    if unicodedata.is_normalized("NFD", color["name"]):
        continue
    # Normalize the color name, then strip the decomposed non-ASCII marks
    norm = unicodedata.normalize("NFD", color["name"])
    encodedString = norm.encode("ascii", "ignore")
    color["name"] = encodedString.decode()

# Searches through the list of colorDicts and returns the hex value for the
# given color if the name is found. If not, a NameError is raised
def findNamedColorHex(color):
    # These bounds are changed as the binary search occurs
    lowerBound = 0
    upperBound = len(colorDictList) - 1
    # Search until the bounds overlap
    while lowerBound <= upperBound:
    #for foo in range(0,20):
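
A quick check of the folding loop above (Python 3.8+): a plain-ASCII name is already NFD-normalized and would be skipped, while an accented one is decomposed and stripped to ASCII. The sample names are hypothetical.

for name in ("Red", "Açaí"):
    if unicodedata.is_normalized("NFD", name):
        print(name, "-> already NFD, kept as-is")
    else:
        folded = unicodedata.normalize("NFD", name).encode("ascii", "ignore").decode()
        print(name, "->", folded)  # Açaí -> Acai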
Example No. 11
from unicodedata import is_normalized  # available since Python 3.8


def is_nfd(x):
    return is_normalized('NFD', x)
Example No. 12
def is_nfc(x):
    return is_normalized('NFC', x)
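
Both wrappers assume is_normalized has been imported from unicodedata (Python 3.8+); with that in place they distinguish precomposed from decomposed text:

print(is_nfc("caf\u00e9"), is_nfd("caf\u00e9"))    # True False  (precomposed é)
print(is_nfc("cafe\u0301"), is_nfd("cafe\u0301"))  # False True  (e + combining acute)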
Example No. 13
def _submit_job(sourceBucket, sourceKey):

    logger.debug("preflight check start")

    #preflight checks _read_
    pre_flight_response = s3client.head_object(
        Bucket=sourceBucket,
        Key=sourceKey
    )

    logger.debug('## PREFLIGHT_RESPONSE\r' + jsonpickle.encode(dict(**pre_flight_response)))

    # A delete marker means the newest version of the object is a delete
    if pre_flight_response.get('DeleteMarker'):
        raise Exception('Object ' + sourceKey + ' is deleted')

    size = pre_flight_response['ContentLength']
    logger.debug("preflight check end")

    unsupportedStorageClass = False

    #Storage class check
    if 'StorageClass' in pre_flight_response:
        if pre_flight_response['StorageClass'] in ['GLACIER', 'DEEP_ARCHIVE']:
            #check restore status:
            if 'Restore' in pre_flight_response:
                restore = pre_flight_response['Restore']
                logger.debug(restore)
                if 'ongoing-request="false"' not in restore:
                    logger.info('restore is in progress')
                    raise Exception('Object ' + sourceKey + ' is restoring from '  + pre_flight_response['StorageClass'])
            else:
                unsupportedStorageClass = True

    if (unsupportedStorageClass):
        raise Exception('Object ' + sourceKey + ' is in unsupported StorageClass '  + pre_flight_response['StorageClass'])

    # The object key must be NFC-normalized Unicode
    if not unicodedata.is_normalized('NFC', sourceKey):
        raise Exception('Object ' + sourceKey + ' is not in Normalization Form C')

    # use bigger containers for 10GB+
    jobDefinition = (os.environ['JOB_SIZE_SMALL']
                     if pre_flight_response['ContentLength'] < int(os.environ['JOB_SIZE_THRESHOLD'])
                     else os.environ['JOB_SIZE_LARGE'])
    logger.debug("job definition is " + jobDefinition)

    logger.debug("job submission start")

    #submit job
    response = batchclient.submit_job(
        jobName="Fixity",
        jobQueue=os.environ['JOB_QUEUE'],
        jobDefinition=jobDefinition,
        parameters={
            'Bucket': sourceBucket,
            'Key': sourceKey
        },
        propagateTags=True,
        tags={
            'Bucket': sourceBucket,
            'Key': sourceKey,
            'Size': str(pre_flight_response['ContentLength'])
        }
    )

    logger.debug('## BATCH_RESPONSE\r' + jsonpickle.encode(dict(**response)))
    logger.debug("job submission complete")

    return response['jobId']