Пример #1
0
def test_validation_gufunc_arg_shapes():
    """Check that signatures containing unusual digit characters are invalid.

    Each code point below is substituted into two signature templates and
    the result is passed to ``assertInvalidArgument`` together with
    ``gu.gufunc_arg_shapes``.
    """
    # A sample of non-ASCII code points; the list is not exhaustive.
    weird_chars = (1632, 1633, 1634, 65303, 65304, 65305)
    templates = ("(3),(%s)->()", "(x,4),(foo%s)->(5)")

    for codepoint in weird_chars:
        for template in templates:
            signature = template % hunichr(codepoint)
            assertInvalidArgument(gu.gufunc_arg_shapes, signature)
Пример #2
0
 def simplify(self, x):
     """Yield candidate replacement characters for ``x``.

     If ``x`` is one of ``self.ascii_characters``, yield ``x`` and then
     every entry before it, walking backwards to index 0. Otherwise yield
     all ASCII characters in reverse order and, for a non-zero code point,
     the characters at half the code point and at the code point minus one.
     """
     ascii_chars = self.ascii_characters
     if x in ascii_chars:
         position = ascii_chars.index(x)
         for offset in hrange(position, -1, -1):
             yield ascii_chars[offset]
         return

     codepoint = ord(x)
     for candidate in reversed(ascii_chars):
         yield text_type(candidate)
     if codepoint > 0:
         yield hunichr(codepoint // 2)
         yield hunichr(codepoint - 1)
Пример #3
0
 def _handle_character_sets(self, state):
     """Expand one ``(opcode, value)`` pair into a list of characters.

     RANGE gives every character from ``value[0]`` to ``value[1]``
     inclusive, LITERAL gives a single character, and CATEGORY is
     delegated to ``self._categories``.

     Raises NotImplementedError for any other opcode.
     """
     opcode, value = state
     if opcode == re.sre_parse.RANGE:
         first, last = value[0], value[1]
         return [hunichr(code) for code in hrange(first, last + 1)]
     if opcode == re.sre_parse.LITERAL:
         return [hunichr(value)]
     if opcode == re.sre_parse.CATEGORY:
         return self._categories(value)
     message = "Unable to find character set: {0}".format(opcode)
     raise NotImplementedError(message)
def test_output_emitting_unicode(testdir, monkeypatch):
    """Run a unicode-emitting test file with the locale forced to ``C``.

    The run must succeed, mention ``test_emits_unicode`` and contain code
    point 1001 either as the character itself or in escaped form.
    """
    for variable in ("LC_ALL", "LANG"):
        monkeypatch.setenv(variable, "C")
    script = testdir.makepyfile(UNICODE_EMITTING)
    # Use the subprocess runner when this testdir provides one.
    run = getattr(testdir, "runpytest_subprocess", testdir.runpytest)
    result = run(script, "--verbose", "--capture=no")
    out = "\n".join(result.stdout.lines)
    assert "test_emits_unicode" in out
    emitted = hunichr(1001)
    assert emitted in out or escape_unicode_characters(emitted) in out
    assert result.ret == 0
Пример #5
0
 def basic_simplify(self, random, x):
     """Yield candidate replacement characters for ``x``.

     If ``x`` is one of ``self.ascii_characters``, yield the entries that
     come before it, in order. Otherwise yield every ASCII character, then
     the character at half the code point, then up to nine characters
     counting down from the code point minus one (never below zero).
     """
     ascii_chars = self.ascii_characters
     if x in ascii_chars:
         position = ascii_chars.index(x)
         for offset in hrange(0, position):
             yield ascii_chars[offset]
         return

     codepoint = ord(x)
     for candidate in ascii_chars:
         yield text_type(candidate)
     yield hunichr(codepoint // 2)
     # Stop before codepoint - 10, and never go below code point 0.
     stop = max(codepoint - 10, -1)
     for smaller in hrange(codepoint - 1, stop, -1):
         yield hunichr(smaller)
Пример #6
0
def test_output_emitting_unicode(testdir, monkeypatch):
    """Run a unicode-emitting test file with the locale forced to ``C``.

    The run must succeed, mention ``test_emits_unicode`` and contain code
    point 1001 either as the character itself or in escaped form.
    """
    for variable in ('LC_ALL', 'LANG'):
        monkeypatch.setenv(variable, 'C')
    script = testdir.makepyfile(UNICODE_EMITTING)
    # Use the subprocess runner when this testdir provides one.
    runner = getattr(testdir, 'runpytest_subprocess', testdir.runpytest)
    result = runner(script, '--verbose', '--capture=no')
    out = '\n'.join(result.stdout.lines)
    assert 'test_emits_unicode' in out
    char = hunichr(1001)
    assert char in out or escape_unicode_characters(char) in out
    assert result.ret == 0
Пример #7
0
def test_output_emitting_unicode(testdir, monkeypatch):
    """Run a unicode-emitting test file with the locale forced to ``C``.

    The run must succeed, mention ``test_emits_unicode`` and contain code
    point 1001 either as the character itself or in escaped form.
    """
    for name in ("LC_ALL", "LANG"):
        monkeypatch.setenv(name, "C")
    script = testdir.makepyfile(UNICODE_EMITTING)
    # Use the subprocess runner when this testdir provides one.
    run_pytest = getattr(testdir, "runpytest_subprocess", testdir.runpytest)
    result = run_pytest(script, "--verbose", "--capture=no")
    out = "\n".join(result.stdout.lines)
    assert "test_emits_unicode" in out
    expected_char = hunichr(1001)
    escaped = expected_char in out or (
        escape_unicode_characters(expected_char) in out)
    assert escaped
    assert result.ret == 0
Пример #8
0
def test_output_emitting_unicode(testdir, monkeypatch):
    """Run a unicode-emitting test file with the locale forced to ``C``.

    The run must succeed, mention ``test_emits_unicode`` and contain code
    point 1001 either as the character itself or in escaped form.
    """
    for env_name in ('LC_ALL', 'LANG'):
        monkeypatch.setenv(env_name, 'C')
    script = testdir.makepyfile(UNICODE_EMITTING)
    # Use the subprocess runner when this testdir provides one.
    invoke = getattr(testdir, 'runpytest_subprocess', testdir.runpytest)
    result = invoke(script, '--verbose', '--capture=no')
    out = '\n'.join(result.stdout.lines)
    assert 'test_emits_unicode' in out
    target = hunichr(1001)
    assert target in out or escape_unicode_characters(target) in out
    assert result.ret == 0
Пример #9
0
        def accept(random, template):
            """Yield characters between ``lo`` and the code point of ``template``.

            Nothing is yielded when the template's code point is at or below
            ``lo``. Otherwise the lower bound starts at ``lo`` and is moved
            to the midpoint between itself and the target on each step,
            yielding the bound and, when the gap allows, one random
            character strictly inside the step.
            """
            target = ord(template)
            if target <= lo:
                return

            lower = lo
            while True:
                yield hunichr(lower)
                midpoint = (lower + target) // 2
                # Stop once the bound no longer advances or reaches ``hi``.
                if not (lower < midpoint < hi):
                    return
                if midpoint - lower > 2:
                    yield hunichr(random.randint(lower + 1, midpoint - 1))
                lower = midpoint
Пример #10
0
        def accept(random, template):
            """Yield characters between ``lo`` and the code point of ``template``.

            Nothing is yielded when the template's code point is at or below
            ``lo``. Otherwise the lower bound starts at ``lo`` and is moved
            to the midpoint between itself and the target on each step,
            yielding the bound and, when the gap allows, one random
            character strictly inside the step.
            """
            goal = ord(template)
            if goal <= lo:
                return

            bound = lo
            while True:
                yield hunichr(bound)
                next_bound = (bound + goal) // 2
                # Stop once the bound no longer advances or reaches ``hi``.
                if not (bound < next_bound < hi):
                    return
                if next_bound - bound > 2:
                    yield hunichr(random.randint(bound + 1, next_bound - 1))
                bound = next_bound
Пример #11
0
    def draw_parameter(self, random):
        """Draw a random alphabet and return it as a tuple of characters.

        The target size is ``1 + dist.geometric(random, 0.1)``. Ten buckets
        are split between three code point sources: a geometric draw with
        parameter 1/127, an entry of ``_spaces``, and a uniform draw over
        ``0..sys.maxunicode``. A candidate is kept only if ``self.is_good``
        accepts it. If the alphabet has no newline, one is appended when
        ``random.randint(0, 10)`` returns 0.
        """
        alphabet_size = 1 + dist.geometric(random, 0.1)
        alphabet = []
        buckets = 10
        ascii_chance = random.randint(1, buckets)
        if ascii_chance < buckets:
            space_chance = random.randint(1, buckets - ascii_chance)
        else:
            space_chance = 0
        while len(alphabet) < alphabet_size:
            choice = random.randint(1, buckets)
            if choice <= ascii_chance:
                codepoint = dist.geometric(random, 1.0 / 127)
            elif choice <= ascii_chance + space_chance:
                # Redraw until the index falls inside ``_spaces``.
                while True:
                    # The float literal forces true division, matching the
                    # ``1.0 / 127`` above; ``2 / len(_spaces)`` floors to 0
                    # on Python 2 without ``__future__`` division.
                    i = dist.geometric(random, 2.0 / len(_spaces))
                    if i < len(_spaces):
                        codepoint = _spaces[i]
                        break
            else:
                codepoint = random.randint(0, sys.maxunicode)

            char = hunichr(codepoint)
            if self.is_good(char):
                alphabet.append(char)
        if u'\n' not in alphabet and not random.randint(0, 10):
            alphabet.append(u'\n')
        return tuple(alphabet)
Пример #12
0
def charmap():
    """Return a dict mapping each Unicode category to its codepoint intervals.

    The result is memoised in the module-level ``_charmap``. If the cache
    file named by ``charmap_file()`` does not exist, the table is computed
    from ``unicodedata`` and written there as a gzipped pickle. The table
    is then always loaded back from that file.
    """
    global _charmap
    if _charmap is None:
        f = charmap_file()
        if not os.path.exists(f):
            # Build into a local dict: if writing the cache fails, the
            # global stays None instead of holding a list-based table that
            # later calls would silently return.
            tmp_charmap = {}
            for i in range(0, sys.maxunicode + 1):
                cat = unicodedata.category(hunichr(i))
                rs = tmp_charmap.setdefault(cat, [])
                if rs and rs[-1][-1] == i - 1:
                    # Extend the current run of consecutive code points.
                    rs[-1][-1] += 1
                else:
                    rs.append([i, i])
            data = sorted(
                (k, tuple((map(tuple, v))))
                for k, v in tmp_charmap.items())

            # Write the Unicode table atomically. The temporary file is
            # created beside the target because os.rename cannot move a
            # file across filesystems.
            fd, tmpfile = tempfile.mkstemp(
                dir=os.path.dirname(os.path.abspath(f)))
            os.close(fd)
            # We explicitly set the mtime to an arbitrary value so as to get
            # a stable format for our charmap.
            with GzipFile(tmpfile, 'wb', mtime=1) as o:
                o.write(pickle.dumps(data, pickle.HIGHEST_PROTOCOL))
            os.rename(tmpfile, f)

        with GzipFile(f, 'rb') as i:
            _charmap = dict(pickle.loads(i.read()))
    assert _charmap is not None
    return _charmap
Пример #13
0
def charmap():
    """Return a dict mapping each Unicode category to its codepoint intervals.

    The result is memoised in the module-level ``_charmap``. If the cache
    file named by ``charmap_file()`` does not exist, the table is computed
    from ``unicodedata`` and written there as a gzipped pickle. The table
    is then always loaded back from that file.
    """
    global _charmap
    if _charmap is not None:
        return _charmap

    cache_path = charmap_file()
    if not os.path.exists(cache_path):
        by_category = {}
        for codepoint in range(0, sys.maxunicode + 1):
            category = unicodedata.category(hunichr(codepoint))
            runs = by_category.setdefault(category, [])
            if runs and runs[-1][-1] == codepoint - 1:
                # Extend the current run of consecutive code points.
                runs[-1][-1] += 1
            else:
                runs.append([codepoint, codepoint])
        data = sorted(
            (category, tuple(tuple(pair) for pair in runs))
            for category, runs in by_category.items()
        )

        # Write the Unicode table atomically
        fd, tmpfile = tempfile.mkstemp(dir=tmpdir())
        os.close(fd)
        # The mtime is pinned to an arbitrary value so the output format
        # is stable.
        with GzipFile(tmpfile, "wb", mtime=1) as out:
            out.write(pickle.dumps(data, pickle.HIGHEST_PROTOCOL))
        try:
            os.rename(tmpfile, cache_path)
        except FileExistsError:  # pragma: no cover
            # This exception is only raised on Windows, and coverage is
            # measured on Linux.
            pass

    with GzipFile(cache_path, "rb") as cached:
        _charmap = dict(pickle.loads(cached.read()))
    assert _charmap is not None
    return _charmap
Пример #14
0
def test_charmap_has_right_categories():
    """Every code point in every charmap interval has the listed category."""
    table = cm.charmap()
    for cat, intervals in table.items():
        for start, end in intervals:
            # Intervals are inclusive at both ends.
            for codepoint in range(start, end + 1):
                real = unicodedata.category(hunichr(codepoint))
                assert real == cat, (
                    '%d is %s but reported in %s' % (codepoint, real, cat))
Пример #15
0
def charmap():
    """Return a dict mapping each Unicode category to its codepoint intervals.

    The result is memoised in the module-level ``_charmap``. If the cache
    file named by ``charmap_file()`` does not exist, the table is computed
    from ``unicodedata`` and written there as a gzipped pickle. The table
    is then always loaded back from that file.
    """
    global _charmap
    if _charmap is None:
        f = charmap_file()
        if not os.path.exists(f):
            # Build into a local dict: if writing the cache fails, the
            # global stays None instead of holding a list-based table that
            # later calls would silently return.
            tmp_charmap = {}
            for i in range(0, sys.maxunicode + 1):
                cat = unicodedata.category(hunichr(i))
                rs = tmp_charmap.setdefault(cat, [])
                if rs and rs[-1][-1] == i - 1:
                    # Extend the current run of consecutive code points.
                    rs[-1][-1] += 1
                else:
                    rs.append([i, i])
            data = sorted(
                (k, tuple((map(tuple, v)))) for k, v in tmp_charmap.items())

            # Write the Unicode table atomically
            fd, tmpfile = tempfile.mkstemp(dir=tmpdir())
            os.close(fd)
            # We explicitly set the mtime to an arbitrary value so as to get
            # a stable format for our charmap.
            with GzipFile(tmpfile, 'wb', mtime=1) as o:
                o.write(pickle.dumps(data, pickle.HIGHEST_PROTOCOL))
            os.rename(tmpfile, f)

        with GzipFile(f, 'rb') as i:
            _charmap = dict(pickle.loads(i.read()))
    assert _charmap is not None
    return _charmap
Пример #16
0
    def draw_parameter(self, random):
        """Draw a random alphabet and return it as a tuple of characters.

        The target size is ``1 + dist.geometric(random, 0.1)``. Ten buckets
        are split between three code point sources: a geometric draw with
        parameter 1/127, an entry of ``_spaces``, and a uniform draw over
        ``0..sys.maxunicode``. A candidate is kept only if ``self.is_good``
        accepts it. If the alphabet has no newline, one is appended when
        ``random.randint(0, 10)`` returns 0.
        """
        alphabet_size = 1 + dist.geometric(random, 0.1)
        alphabet = []
        buckets = 10
        ascii_chance = random.randint(1, buckets)
        if ascii_chance < buckets:
            space_chance = random.randint(1, buckets - ascii_chance)
        else:
            space_chance = 0
        while len(alphabet) < alphabet_size:
            choice = random.randint(1, buckets)
            if choice <= ascii_chance:
                codepoint = dist.geometric(random, 1.0 / 127)
            elif choice <= ascii_chance + space_chance:
                # Redraw until the index falls inside ``_spaces``.
                while True:
                    # The float literal forces true division, matching the
                    # ``1.0 / 127`` above; ``2 / len(_spaces)`` floors to 0
                    # on Python 2 without ``__future__`` division.
                    i = dist.geometric(random, 2.0 / len(_spaces))
                    if i < len(_spaces):
                        codepoint = _spaces[i]
                        break
            else:
                codepoint = random.randint(0, sys.maxunicode)

            char = hunichr(codepoint)
            if self.is_good(char):
                alphabet.append(char)
        if u'\n' not in alphabet and not random.randint(0, 10):
            alphabet.append(u'\n')
        return tuple(alphabet)
Пример #17
0
def test_charmap_has_right_categories():
    """Every code point in every charmap interval has the listed category."""
    mapping = cm.charmap()
    for cat, intervals in mapping.items():
        for low, high in intervals:
            # Intervals are inclusive at both ends.
            for code in range(low, high + 1):
                real = unicodedata.category(hunichr(code))
                assert real == cat, (
                    '%d is %s but reported in %s' % (code, real, cat))
 def do_draw(self, data):
     """Draw one character from ``self.intervals``.

     The index comes from ``integer_range`` over the valid positions of
     ``self.intervals``, with ``self.zero_point`` passed as the center.
     """
     last_index = len(self.intervals) - 1
     index = integer_range(data, 0, last_index, center=self.zero_point)
     codepoint = self.intervals[index]
     return hunichr(codepoint)
Пример #19
0
 def produce_parameter(self, random):
     """Return a random alphabet as a tuple of characters.

     The size is ``1 + dist.geometric(random, 0.1)``. Code points are drawn
     uniformly from ``0..sys.maxunicode``, and characters in category
     ``Cs`` are skipped.
     """
     wanted = 1 + dist.geometric(random, 0.1)
     chosen = []
     while len(chosen) < wanted:
         candidate = hunichr(random.randint(0, sys.maxunicode))
         if unicodedata.category(candidate) == 'Cs':
             continue
         chosen.append(candidate)
     return tuple(chosen)
Пример #20
0
 def produce(self, random, pv):
     """Produce one character.

     When ``dist.biased_coin(random, pv.ascii_chance)`` is true, pick from
     ``self.ascii_characters``. Otherwise draw uniform code points until
     one is not in category ``Cs``.
     """
     if dist.biased_coin(random, pv.ascii_chance):
         return random.choice(self.ascii_characters)

     while True:
         candidate = hunichr(random.randint(0, sys.maxunicode))
         if unicodedata.category(candidate) == 'Cs':
             continue
         return candidate
Пример #21
0
 def do_draw(self, data):
     """Draw characters from ``self.intervals`` until one is not blacklisted.

     Each index comes from ``integer_range`` over the valid positions of
     ``self.intervals``, with ``self.zero_point`` passed as the center.
     """
     while True:
         index = integer_range(
             data, 0, len(self.intervals) - 1, center=self.zero_point)
         char = hunichr(self.intervals[index])
         if char in self.blacklist_characters:
             continue
         return char
Пример #22
0
        def accept(random, template):
            """Yield acceptable characters between ``lo`` and ``template``.

            Nothing is yielded when the template's code point is at or below
            ``lo``. Otherwise the lower bound starts at ``lo`` and is moved
            to the midpoint between itself and the target on each step. The
            bound and, when the gap allows, one random character inside the
            step are yielded if ``self.is_good`` accepts them.
            """
            target = ord(template)
            if target <= lo:
                return

            lower = lo
            while True:
                candidate = hunichr(lower)
                if self.is_good(candidate):
                    yield candidate
                midpoint = (lower + target) // 2
                # Stop once the bound no longer advances or reaches ``hi``.
                if not (lower < midpoint < hi):
                    return
                if midpoint - lower > 2:
                    candidate = hunichr(
                        random.randint(lower + 1, midpoint - 1))
                    if self.is_good(candidate):
                        yield candidate
                lower = midpoint
Пример #23
0
def test_unicode_tree_categories():
    """The tree's categories are exactly those ``unicodedata`` reports.

    The expected set is built from every code point in
    ``0..sys.maxunicode``.
    """
    tree = charstree.unicode_tree()
    expected = set(
        unicodedata.category(hunichr(codepoint))
        for codepoint in range(0, sys.maxunicode + 1)
    )
    actual = charstree.categories(tree)
    assert sorted(expected) == sorted(actual)
Пример #24
0
def test_query_matches_categories(exclude, include):
    """Sample each interval from ``cm.query`` and check its category.

    Both endpoints and the midpoint of every interval must be in
    ``include`` (when it is not None) and must not be in ``exclude``.
    """
    values = cm.query(exclude, include)
    assert_valid_range_list(values)
    for start, end in values:
        samples = (start, end, (start + end) // 2)
        for codepoint in samples:
            cat = unicodedata.category(hunichr(codepoint))
            assert include is None or cat in include
            assert cat not in exclude
Пример #25
0
        def accept(random, template):
            """Yield acceptable characters between ``lo`` and ``template``.

            Nothing is yielded when the template's code point is at or below
            ``lo``. Otherwise the lower bound starts at ``lo`` and is moved
            to the midpoint between itself and the target on each step. The
            bound and, when the gap allows, one random character inside the
            step are yielded if ``self.is_good`` accepts them.
            """
            goal = ord(template)
            if goal <= lo:
                return

            bound = lo
            while True:
                char = hunichr(bound)
                if self.is_good(char):
                    yield char
                next_bound = (bound + goal) // 2
                # Stop once the bound no longer advances or reaches ``hi``.
                if not (bound < next_bound < hi):
                    return
                if next_bound - bound > 2:
                    char = hunichr(
                        random.randint(bound + 1, next_bound - 1))
                    if self.is_good(char):
                        yield char
                bound = next_bound
Пример #26
0
def test_query_matches_categories(exclude, include):
    """Sample each interval from ``cm.query`` and check its category.

    Both endpoints and the midpoint of every interval must be in
    ``include`` (when it is not None) and must not be in ``exclude``.
    """
    values = cm.query(exclude, include)
    assert_valid_range_list(values)
    for low, high in values:
        midpoint = (low + high) // 2
        for code in (low, high, midpoint):
            cat = unicodedata.category(hunichr(code))
            assert include is None or cat in include
            assert cat not in exclude
Пример #27
0
def charmap():
    """Return a dict that maps a Unicode category, to a tuple of 2-tuples
    covering the codepoint intervals for characters in that category.

    The result is memoised in the module-level ``_charmap`` and cached, on
    a best-effort basis, as gzipped JSON in the file named by
    ``charmap_file()``.

    >>> charmap()['Co']
    ((57344, 63743), (983040, 1048573), (1048576, 1114109))
    """
    global _charmap
    # Best-effort caching in the face of missing files and/or unwritable
    # filesystems is fairly simple: check if loaded, else try loading,
    # else calculate and try writing the cache.
    if _charmap is None:
        f = charmap_file()
        try:
            with gzip.GzipFile(f, "rb") as i:
                # json.load reads from the file object. json.loads only
                # accepts str/bytes, so passing the GzipFile always raised
                # TypeError and the on-disk cache was never used.
                tmp_charmap = dict(json.load(i))

        except Exception:
            # Missing or unreadable cache: recompute from unicodedata.
            tmp_charmap = {}
            for i in range(0, sys.maxunicode + 1):
                cat = unicodedata.category(hunichr(i))
                rs = tmp_charmap.setdefault(cat, [])
                if rs and rs[-1][-1] == i - 1:
                    # Extend the current run of consecutive code points.
                    rs[-1][-1] += 1
                else:
                    rs.append([i, i])

            try:
                # Write the Unicode table atomically
                tmpdir = storage_directory("tmp")
                mkdir_p(tmpdir)
                fd, tmpfile = tempfile.mkstemp(dir=tmpdir)
                os.close(fd)
                # Explicitly set the mtime to get reproducible output
                with gzip.GzipFile(tmpfile, "wb", mtime=1) as o:
                    result = json.dumps(sorted(tmp_charmap.items()))
                    o.write(result.encode())

                os.renames(tmpfile, f)
            except Exception:
                # Deliberately best-effort: a failed write must not stop
                # the computed table from being returned.
                pass

        # convert between lists and tuples
        _charmap = {
            k: tuple(tuple(pair) for pair in pairs) for k, pairs in tmp_charmap.items()
        }
        # each value is a tuple of 2-tuples (that is, tuples of length 2)
        # and that both elements of that tuple are integers.
        for vs in _charmap.values():
            ints = list(sum(vs, ()))
            assert all([isinstance(x, int) for x in ints])
            assert ints == sorted(ints)
            assert all([len(tup) == 2 for tup in vs])

    assert _charmap is not None
    return _charmap
Пример #28
0
 def new_tree():
     """Build an OrderedDict of category -> list of ``[start, end]`` runs.

     Every code point in ``0..sys.maxunicode`` is visited in order, and
     consecutive code points of the same category are merged into one run.
     """
     result = OrderedDict()
     for cp in hrange(0, sys.maxunicode + 1):
         runs = result.setdefault(unicodedata.category(hunichr(cp)), [])
         if not runs or runs[-1][-1] + 1 != cp:
             # No run yet, or a gap: start a new single-point run.
             runs.append([cp, cp])
         else:
             runs[-1][-1] = cp
     return result
Пример #29
0
 def new_tree():
     """Build an OrderedDict of category -> list of ``[start, end]`` runs.

     Every code point in ``0..sys.maxunicode`` is visited in order, and
     consecutive code points of the same category are merged into one run.
     """
     categories = OrderedDict()
     for code in hrange(0, sys.maxunicode + 1):
         category = unicodedata.category(hunichr(code))
         intervals = categories.setdefault(category, [])
         if not intervals or intervals[-1][-1] + 1 != code:
             # No run yet, or a gap: start a new single-point run.
             intervals.append([code, code])
         else:
             intervals[-1][-1] = code
     return categories
Пример #30
0
    def try_ascii(self, random, template):
        """Yield ASCII candidates related to ``template``.

        For a template below ``u'0'``, first yield the characters from just
        above the template up to and including ``self.zero_point``. Then
        yield the entries of ``self.ascii_characters`` that are at least
        ``u'0'``, stopping at the first one not below the template.
        """
        if template < u'0':
            start = ord(template) + 1
            for codepoint in hrange(start, self.zero_point + 1):
                yield hunichr(codepoint)

        for candidate in self.ascii_characters:
            if candidate >= u'0':
                if candidate >= template:
                    break
                yield candidate
Пример #31
0
 def do_draw(self, data):
     """Draw characters from ``self.intervals`` until one is not blacklisted.

     Each index comes from ``integer_range`` over the valid positions of
     ``self.intervals``, with ``self.zero_point`` passed as the center.
     """
     while True:
         position = integer_range(
             data, 0, len(self.intervals) - 1, center=self.zero_point)
         drawn = hunichr(self.intervals[position])
         if drawn in self.blacklist_characters:
             continue
         return drawn
Пример #32
0
def charmap():
    """Return a dict that maps a Unicode category, to a tuple of 2-tuples
    covering the codepoint intervals for characters in that category.

    The result is memoised in the module-level ``_charmap`` and cached, on
    a best-effort basis, as gzipped JSON in the file named by
    ``charmap_file()``.

    >>> charmap()['Co']
    ((57344, 63743), (983040, 1048573), (1048576, 1114109))
    """
    global _charmap
    # Best-effort caching in the face of missing files and/or unwritable
    # filesystems is fairly simple: check if loaded, else try loading,
    # else calculate and try writing the cache.
    if _charmap is None:
        f = charmap_file()
        try:
            with gzip.GzipFile(f, "rb") as i:
                # json.load reads from the file object. json.loads only
                # accepts str/bytes, so passing the GzipFile always raised
                # TypeError and the on-disk cache was never used.
                tmp_charmap = dict(json.load(i))

        except Exception:
            # Missing or unreadable cache: recompute from unicodedata.
            tmp_charmap = {}
            for i in range(0, sys.maxunicode + 1):
                cat = unicodedata.category(hunichr(i))
                rs = tmp_charmap.setdefault(cat, [])
                if rs and rs[-1][-1] == i - 1:
                    # Extend the current run of consecutive code points.
                    rs[-1][-1] += 1
                else:
                    rs.append([i, i])

            try:
                # Write the Unicode table atomically
                fd, tmpfile = tempfile.mkstemp(dir=tmpdir())
                os.close(fd)
                # Explicitly set the mtime to get reproducible output
                with gzip.GzipFile(tmpfile, "wb", mtime=1) as o:
                    result = json.dumps(sorted(tmp_charmap.items()))
                    o.write(result.encode())

                os.rename(tmpfile, f)
            except Exception:
                # Deliberately best-effort: a failed write must not stop
                # the computed table from being returned.
                pass

        # convert between lists and tuples
        _charmap = {
            k: tuple(tuple(pair) for pair in pairs) for k, pairs in tmp_charmap.items()
        }
        # each value is a tuple of 2-tuples (that is, tuples of length 2)
        # and that both elements of that tuple are integers.
        for vs in _charmap.values():
            ints = list(sum(vs, ()))
            assert all([isinstance(x, int) for x in ints])
            assert ints == sorted(ints)
            assert all([len(tup) == 2 for tup in vs])

    assert _charmap is not None
    return _charmap
Пример #33
0
    def try_ascii(self, random, template):
        """Yield ASCII candidates related to ``template``.

        For a template below ``'0'``, first yield the characters from just
        above the template up to and including ``self.zero_point``. Then
        yield the entries of ``self.ascii_characters`` that are at least
        ``'0'``, stopping at the first one not below the template.
        """
        if template < '0':
            first = ord(template) + 1
            for code in hrange(first, self.zero_point + 1):
                yield hunichr(code)

        for char in self.ascii_characters:
            if char >= '0':
                if char >= template:
                    break
                yield char
    def draw_parameter(self, random):
        """Draw a random alphabet and return it as a tuple of characters.

        The size is ``1 + dist.geometric(random, 0.1)``. A code point is a
        geometric draw with parameter 1/127 when ``random.randint(0, 10)``
        is 0, and a uniform draw over ``0..sys.maxunicode`` otherwise.
        Characters in category ``Cs`` are skipped.
        """
        wanted = 1 + dist.geometric(random, 0.1)
        chosen = []
        while len(chosen) < wanted:
            if random.randint(0, 10) == 0:
                codepoint = dist.geometric(random, 1.0 / 127)
            else:
                codepoint = random.randint(0, sys.maxunicode)

            candidate = hunichr(codepoint)
            if unicodedata.category(candidate) == 'Cs':
                continue
            chosen.append(candidate)
        return tuple(chosen)
Пример #35
0
    def draw_parameter(self, random):
        """Draw a random alphabet and return it as a tuple of characters.

        The size is ``1 + dist.geometric(random, 0.1)``. A code point is a
        geometric draw with parameter 1/127 when ``random.randint(0, 10)``
        is 0, and a uniform draw over ``0..sys.maxunicode`` otherwise.
        Only characters accepted by ``self.is_good`` are kept.
        """
        wanted = 1 + dist.geometric(random, 0.1)
        chosen = []
        while len(chosen) < wanted:
            if random.randint(0, 10) == 0:
                codepoint = dist.geometric(random, 1.0 / 127)
            else:
                codepoint = random.randint(0, sys.maxunicode)

            candidate = hunichr(codepoint)
            if not self.is_good(candidate):
                continue
            chosen.append(candidate)
        return tuple(chosen)
Пример #36
0
def _test_matching_pattern(pattern, isvalidchar, is_unicode=False):
    """Check that ``pattern`` matches exactly the chars ``isvalidchar`` accepts.

    The unicode variant compiles with ``unicode_regex`` and covers every
    code point in ``0..sys.maxunicode``. The ASCII variant compiles with
    ``ascii_regex`` and covers code points 1..127.
    """
    if is_unicode:
        r = unicode_regex(pattern)
        codepoints = hrange(0, sys.maxunicode + 1)
    else:
        r = ascii_regex(pattern)
        codepoints = hrange(1, 128)

    chars = [hunichr(codepoint) for codepoint in codepoints]
    for char in chars:
        if not isvalidchar(char):
            assert not r.search(char), (
                '"%s" supposed not to match "%s" (%r, category "%s"), '
                'but it does' % (
                    pattern, char, char, unicodedata.category(char)))
            continue
        assert r.search(char), (
            '"%s" supposed to match "%s" (%r, category "%s"), '
            "but it doesn't" % (
                pattern, char, char, unicodedata.category(char)))
def _test_matching_pattern(pattern, isvalidchar, is_unicode=False):
    """Check that ``pattern`` matches exactly the chars ``isvalidchar`` accepts.

    The unicode variant compiles with ``unicode_regex`` and covers every
    code point in ``0..sys.maxunicode``. The ASCII variant compiles with
    ``ascii_regex`` and covers code points 1..127.
    """
    if is_unicode:
        r = unicode_regex(pattern)
        codepoints = hrange(0, sys.maxunicode + 1)
    else:
        r = ascii_regex(pattern)
        codepoints = hrange(1, 128)

    candidates = [hunichr(cp) for cp in codepoints]
    for ch in candidates:
        if not isvalidchar(ch):
            assert not r.search(ch), (
                '"%s" supposed not to match "%s" (%r, category "%s"), '
                "but it does" % (pattern, ch, ch, unicodedata.category(ch))
            )
            continue
        assert r.search(ch), (
            '"%s" supposed to match "%s" (%r, category "%s"), '
            "but it doesn't" % (pattern, ch, ch, unicodedata.category(ch))
        )
Пример #38
0
def charmap():
    """Return a dict that maps a Unicode category, to a tuple of 2-tuples
    covering the codepoint intervals for characters in that category.

    The result is memoised in the module-level ``_charmap`` and cached, on
    a best-effort basis, as a gzipped pickle in the file named by
    ``charmap_file()``.

    >>> charmap()['Co']
    ((57344, 63743), (983040, 1048573), (1048576, 1114109))

    """
    global _charmap
    if _charmap is not None:
        return _charmap

    # Best-effort caching: try loading the cache file, else calculate the
    # table and try writing the cache.
    cache_path = charmap_file()
    try:
        with gzip.GzipFile(cache_path, 'rb') as cached:
            _charmap = dict(pickle.load(cached))

    except Exception:
        by_category = {}
        for codepoint in range(0, sys.maxunicode + 1):
            category = unicodedata.category(hunichr(codepoint))
            runs = by_category.setdefault(category, [])
            if runs and runs[-1][-1] == codepoint - 1:
                # Extend the current run of consecutive code points.
                runs[-1][-1] += 1
            else:
                runs.append([codepoint, codepoint])
        _charmap = {
            category: tuple(map(tuple, runs))
            for category, runs in by_category.items()
        }

        try:
            # Write the Unicode table atomically
            fd, tmpfile = tempfile.mkstemp(dir=tmpdir())
            os.close(fd)
            # Explicitly set the mtime to get reproducible output
            with gzip.GzipFile(tmpfile, 'wb', mtime=1) as out:
                pickle.dump(
                    sorted(_charmap.items()), out, pickle.HIGHEST_PROTOCOL)
            os.rename(tmpfile, cache_path)
        except Exception:  # pragma: no cover
            pass
    assert _charmap is not None
    return _charmap
Пример #39
0
def charmap():
    """Return a dict that maps a Unicode category, to a tuple of 2-tuples
    covering the codepoint intervals for characters in that category.

    The result is memoised in the module-level ``_charmap`` and cached, on
    a best-effort basis, as a gzipped pickle in the file named by
    ``charmap_file()``.

    >>> charmap()['Co']
    ((57344, 63743), (983040, 1048573), (1048576, 1114109))

    """
    global _charmap
    if _charmap is not None:
        return _charmap

    # Best-effort caching: try loading the cache file, else calculate the
    # table and try writing the cache.
    cache_path = charmap_file()
    try:
        with gzip.GzipFile(cache_path, 'rb') as cached:
            _charmap = dict(pickle.load(cached))

    except Exception:
        by_category = {}
        for codepoint in range(0, sys.maxunicode + 1):
            category = unicodedata.category(hunichr(codepoint))
            runs = by_category.setdefault(category, [])
            if runs and runs[-1][-1] == codepoint - 1:
                # Extend the current run of consecutive code points.
                runs[-1][-1] += 1
            else:
                runs.append([codepoint, codepoint])
        _charmap = {
            category: tuple(tuple(pair) for pair in runs)
            for category, runs in by_category.items()
        }

        try:
            # Write the Unicode table atomically
            fd, tmpfile = tempfile.mkstemp(dir=tmpdir())
            os.close(fd)
            # Explicitly set the mtime to get reproducible output
            with gzip.GzipFile(tmpfile, 'wb', mtime=1) as out:
                pickle.dump(
                    sorted(_charmap.items()), out, pickle.HIGHEST_PROTOCOL)
            os.rename(tmpfile, cache_path)
        except Exception:  # pragma: no cover
            pass
    assert _charmap is not None
    return _charmap
Пример #40
0
    def try_ascii(self, random, template):
        """Yield characters from ``self.ascii_tree`` between the template
        and ``self.zero_point``.

        Nothing is yielded when the tree is empty or the template's code
        point equals the zero point. Otherwise the tree is filtered to the
        code point range spanned by the two values and each code point in
        the filtered tree is yielded as a character.
        """
        tree = self.ascii_tree
        if not tree:
            return

        zero_point = self.zero_point
        codepoint = ord(template)
        if codepoint == zero_point:
            return

        low = min(codepoint, zero_point)
        high = max(codepoint, zero_point)
        subtree = charstree.filter_tree(
            tree, min_codepoint=low, max_codepoint=high)

        for found in charstree.codepoints(subtree):
            yield hunichr(found)
Пример #41
0
def charmap():
    """Return a dict that maps a Unicode category, to a tuple of 2-tuples
    covering the codepoint intervals for characters in that category.

    The result is memoised in the module-level ``_charmap``. If the cache
    file named by ``charmap_file()`` does not exist it is created first.
    The table is then always loaded back from that file.

    >>> charmap()['Co']
    ((57344, 63743), (983040, 1048573), (1048576, 1114109))

    """
    global _charmap
    if _charmap is not None:
        return _charmap

    cache_path = charmap_file()
    if not os.path.exists(cache_path):
        by_category = {}
        for codepoint in range(0, sys.maxunicode + 1):
            category = unicodedata.category(hunichr(codepoint))
            runs = by_category.setdefault(category, [])
            if runs and runs[-1][-1] == codepoint - 1:
                # Extend the current run of consecutive code points.
                runs[-1][-1] += 1
            else:
                runs.append([codepoint, codepoint])
        data = sorted(
            (category, tuple(tuple(pair) for pair in runs))
            for category, runs in by_category.items()
        )

        # Write the Unicode table atomically
        fd, tmpfile = tempfile.mkstemp(dir=tmpdir())
        os.close(fd)
        # The mtime is pinned to an arbitrary value so the output format
        # is stable.
        with GzipFile(tmpfile, 'wb', mtime=1) as out:
            out.write(pickle.dumps(data, pickle.HIGHEST_PROTOCOL))
        try:
            os.rename(tmpfile, cache_path)
        except FileExistsError:  # pragma: no cover
            # This exception is only raised on Windows, and coverage is
            # measured on Linux.
            pass

    with GzipFile(cache_path, 'rb') as cached:
        _charmap = dict(pickle.loads(cached.read()))
    assert _charmap is not None
    return _charmap
Пример #42
0
def do_filter_tree_by_characters(tree, blacklist_characters, acc, base):
    """Copy ``tree`` into ``acc`` with the blacklisted characters removed.

    :param tree: mapping whose keys are offsets added to ``base``; values
        are either nested mappings of the same shape or sequences of
        ``(low, high)`` pairs relative to ``base + key``.
    :param blacklist_characters: sequence of characters to remove.
        NOTE(review): ``bisect`` is applied to it, so it is presumably
        sorted -- confirm against the caller.
    :param acc: mapping that receives the filtered entries.
    :param base: code point offset accumulated from the enclosing levels.
    :return: ``tree`` itself when ``blacklist_characters`` is empty,
        otherwise ``acc``.
    """
    if not blacklist_characters:
        return tree

    for key, value in tree.items():
        this_base = base + key

        # Drop blacklist entries that sort before this subtree's base,
        # keeping one entry before the insertion point.
        index = bisect.bisect(blacklist_characters, hunichr(this_base))
        characters = blacklist_characters[index - 1 if index else 0:]

        if isinstance(value, dict):
            # Inner node: recurse with a fresh accumulator and keep the
            # subtree only if something is left in it.
            subtree = do_filter_tree_by_characters(
                value, characters, OrderedDict(), this_base)
            if subtree:
                acc[key] = subtree
        else:
            # Leaf: cut each blacklisted code point out of the intervals,
            # one character at a time.
            filtered_value = value
            for character in characters:
                codepoint = ord(character)
                value_acc = []
                for item in filtered_value:
                    # Absolute bounds of this interval.
                    locp, hicp = item[0] + this_base, item[1] + this_base
                    if locp == codepoint == hicp:
                        # Single-point interval that is exactly the
                        # blacklisted code point: drop it.
                        continue
                    elif not (locp <= codepoint <= hicp):
                        # Code point is outside the interval: keep as is.
                        value_acc.append(item)
                    elif locp == codepoint:
                        # Trim the lower end.
                        item = (codepoint + 1 - this_base, item[1])
                        value_acc.append(item)
                    elif hicp == codepoint:
                        # Trim the upper end.
                        item = (item[0], codepoint - 1 - this_base)
                        value_acc.append(item)
                    else:
                        # Code point is strictly inside: split in two.
                        value_acc.append((item[0], codepoint - 1 - this_base))
                        value_acc.append((codepoint + 1 - this_base, item[1]))
                filtered_value = value_acc
            if filtered_value:
                acc[key] = tuple(filtered_value)
    return acc
    def try_ascii(self, random, template):
        """Yield characters from ``self.ascii_tree`` between the template
        and ``self.zero_point``.

        Nothing is yielded when the tree is empty or the template's code
        point equals the zero point. Otherwise the tree is filtered to the
        code point range spanned by the two values and each code point in
        the filtered tree is yielded as a character.
        """
        tree = self.ascii_tree
        if not tree:
            return

        zero_point = self.zero_point
        target = ord(template)
        if target == zero_point:
            return

        bounds = {
            'min_codepoint': min(target, zero_point),
            'max_codepoint': max(target, zero_point),
        }
        subtree = charstree.filter_tree(tree, **bounds)

        for code in charstree.codepoints(subtree):
            yield hunichr(code)
Пример #44
0
def do_filter_tree_by_characters(tree, blacklist_characters, acc, base):
    """Copy ``tree`` into ``acc`` with the blacklisted characters removed.

    :param tree: mapping whose keys are integer offsets and whose values are
        either nested mappings of the same shape or sequences of
        ``(low, high)`` pairs, both relative to the accumulated base.
    :param blacklist_characters: sliceable sequence of characters to remove;
        it is searched with ``bisect``, so it is presumably expected to be
        sorted (verify against the caller).
    :param acc: mapping that receives the surviving entries.
    :param base: offset accumulated from the enclosing levels.
    :returns: ``tree`` itself when there is nothing to blacklist, otherwise
        ``acc``. Entries that end up empty are left out of ``acc``.
    """
    if not blacklist_characters:
        return tree

    for offset, node in tree.items():
        node_base = base + offset

        # Start from the blacklist entry just before the insertion point of
        # this node's base character (or from the beginning).
        cut = bisect.bisect(blacklist_characters, hunichr(node_base))
        start = cut - 1 if cut else 0
        relevant = blacklist_characters[start:]

        if isinstance(node, dict):
            pruned = do_filter_tree_by_characters(
                node, relevant, OrderedDict(), node_base)
            if pruned:
                acc[offset] = pruned
            continue

        remaining = node
        for banned in relevant:
            codepoint = ord(banned)
            survivors = []
            for item in remaining:
                low, high = item[0] + node_base, item[1] + node_base
                if codepoint < low or high < codepoint:
                    # The interval does not contain the code point.
                    survivors.append(item)
                    continue
                # The code point is inside the interval: keep whatever is
                # left on either side of it (possibly nothing at all).
                if low < codepoint:
                    survivors.append((item[0], codepoint - 1 - node_base))
                if codepoint < high:
                    survivors.append((codepoint + 1 - node_base, item[1]))
            remaining = survivors

        if remaining:
            acc[offset] = tuple(remaining)
    return acc
Пример #45
0
    def do_draw(self, data):
        """Draw a single character that is not blacklisted.

        An index into ``self.intervals`` is drawn with ``integer_range``
        (centred on ``self.zero_point`` and guided by the local distribution
        ``d``), looked up in ``self.intervals`` and converted to a character
        with ``hunichr``. Characters found in ``self.blacklist_characters``
        are rejected and the draw is repeated.

        :param data: the data source handed on to ``integer_range``.
        :returns: the first drawn character that is not blacklisted.
        """
        # The float literal keeps this a true division even where ``/`` on
        # two ints would floor to -1 and make log1p raise.
        denom = math.log1p(-1.0 / 127)

        def d(random):
            """Suggest an index into ``self.intervals``."""
            # One time in eleven, prefer one of the special entries.
            if self.special and random.randint(0, 10) == 0:
                return random.choice(self.special)
            if len(self.intervals) <= 256 or random.randint(0, 1):
                # Pick an interval uniformly, then a position inside it.
                i = random.randint(0, len(self.intervals.offsets) - 1)
                u, v = self.intervals.intervals[i]
                # random.randint includes both endpoints, so the largest
                # step that stays inside (u, v) is v - u; v - u + 1 would
                # land one index past the interval.
                # NOTE(review): assumes offsets[i] is the index of the first
                # code point of interval i - confirm against the interval
                # container.
                return self.intervals.offsets[i] + random.randint(0, v - u)
            else:
                # Logarithmic draw, clamped to the last valid index.
                return min(
                    len(self.intervals) - 1,
                    int(math.log(random.random()) / denom))

        while True:
            i = integer_range(
                data, 0, len(self.intervals) - 1,
                center=self.zero_point, distribution=d
            )
            c = hunichr(self.intervals[i])
            if c not in self.blacklist_characters:
                return c
Пример #46
0
    def draw_parameter(self, random):
        """Draw the alphabet (a tuple of characters) used as the parameter.

        The alphabet size is one plus ``dist.geometric(random, 0.1)``. Each
        character comes from the ASCII, spaces or unicode tree: a roll over
        ten buckets selects the source, a category is chosen at random from
        that source and ``charstree.random_codepoint`` supplies the code
        point. A source without categories is skipped in favour of the next
        one. Finally a newline may be appended (one time in seven, when it
        is not already present and ``self.is_good`` accepts it).
        """
        ascii_cats = charstree.categories(self.ascii_tree)
        unicode_cats = charstree.categories(self.unicode_tree)
        space_cats = charstree.categories(self.spaces_tree)

        target_size = 1 + dist.geometric(random, 0.1)
        chars = []
        buckets = 10
        ascii_chance = random.randint(1, buckets)

        # Spaces only get a share of the buckets that ASCII left over.
        space_chance = 0
        if space_cats and ascii_chance < buckets:
            space_chance = random.randint(1, buckets - ascii_chance)
        space_limit = ascii_chance + space_chance

        while len(chars) < target_size:
            roll = random.randint(1, buckets)

            if ascii_cats and roll <= ascii_chance:
                category = random.choice(ascii_cats)
                source = self.ascii_tree
            elif space_cats and roll <= space_limit:
                category = random.choice(space_cats)
                source = self.spaces_tree
            else:
                category = random.choice(unicode_cats)
                source = self.unicode_tree

            chars.append(
                hunichr(charstree.random_codepoint(source, category, random)))

        if u'\n' not in chars and not random.randint(0, 6):
            if self.is_good(u'\n'):
                chars.append(u'\n')

        return tuple(chars)
    def draw_parameter(self, random):
        """Build a random alphabet and return it as a tuple of characters.

        ``1 + dist.geometric(random, 0.1)`` characters are drawn. For every
        character a bucket number between 1 and 10 decides whether the ASCII,
        the spaces or the unicode tree is used (sources without categories
        are passed over); a random category of that tree is then handed to
        ``charstree.random_codepoint``. When no newline was drawn, one is
        added with a one-in-seven chance provided ``self.is_good`` allows it.
        """
        categories_ascii = charstree.categories(self.ascii_tree)
        categories_unicode = charstree.categories(self.unicode_tree)
        categories_spaces = charstree.categories(self.spaces_tree)

        wanted = 1 + dist.geometric(random, 0.1)
        drawn = []
        buckets = 10
        ascii_chance = random.randint(1, buckets)

        # The space share is only drawn when spaces exist and ASCII has not
        # claimed every bucket.
        if not (categories_spaces and ascii_chance < buckets):
            space_chance = 0
        else:
            space_chance = random.randint(1, buckets - ascii_chance)

        while len(drawn) < wanted:
            bucket = random.randint(1, buckets)
            from_ascii = categories_ascii and bucket <= ascii_chance
            from_spaces = (
                not from_ascii and
                categories_spaces and
                bucket <= ascii_chance + space_chance
            )

            if from_ascii:
                picked = random.choice(categories_ascii)
                tree = self.ascii_tree
            elif from_spaces:
                picked = random.choice(categories_spaces)
                tree = self.spaces_tree
            else:
                picked = random.choice(categories_unicode)
                tree = self.unicode_tree

            value = charstree.random_codepoint(tree, picked, random)
            drawn.append(hunichr(value))

        newline_missing = u'\n' not in drawn
        if newline_missing and not random.randint(0, 6):
            if self.is_good(u'\n'):
                drawn.append(u'\n')

        return tuple(drawn)
Пример #48
0
from __future__ import division, print_function, absolute_import, \
    unicode_literals

import sys
import unicodedata

import hypothesis.internal.distributions as dist
from hypothesis.internal.compat import hrange, hunichr, text_type, \
    binary_type
from hypothesis.searchstrategy.strategies import SearchStrategy, \
    MappedSearchStrategy, check_length, check_data_type

# Code points below sys.maxunicode whose Unicode general category is
# Cc (control) or Zs (space separator), in increasing order.
_spaces = list(filter(
    lambda codepoint: unicodedata.category(hunichr(codepoint)) in ('Cc', 'Zs'),
    range(sys.maxunicode)
))


class OneCharStringStrategy(SearchStrategy):

    """A strategy which generates single character strings of text type."""
    # The specifier for this strategy is the text type imported from
    # hypothesis.internal.compat.
    specifier = text_type
    # chr(i) for every i produced by hrange(128), joined into one string
    # (hrange is the compat helper; presumably a range equivalent).
    ascii_characters = ''.join(
        chr(i) for i in hrange(128)
    )
    # Code point of the digit '0'.
    zero_point = ord('0')

    def draw_parameter(self, random):
        # The alphabet size is one plus a draw from dist.geometric.
        alphabet_size = 1 + dist.geometric(random, 0.1)
        # Collects the characters of the alphabet.
        # NOTE(review): this excerpt stops here; the rest of the method is
        # not shown.
        alphabet = []
Пример #49
0
def test_exclude_only_excludes_from_that_category(cat, i):
    """Excluding ``cat`` must keep every code point of other categories.

    Code points that themselves belong to ``cat`` are filtered out with
    ``assume``; for the rest, some interval returned by ``cm.query`` has to
    contain ``i``.
    """
    character = hunichr(i)
    assume(unicodedata.category(character) != cat)
    remaining = cm.query(exclude_categories=(cat, ))
    covered = False
    for lower, upper in remaining:
        if lower <= i <= upper:
            covered = True
            break
    assert covered
Пример #50
0
 def do_draw(self, data):
     """Draw one character from ``self.intervals``.

     ``integer_range`` picks an index between 0 and the last index of
     ``self.intervals``, centred on ``self.zero_point``; the entry found
     there is converted to a character with ``hunichr``.
     """
     last_index = len(self.intervals) - 1
     chosen = integer_range(data, 0, last_index, center=self.zero_point)
     codepoint = self.intervals[chosen]
     return hunichr(codepoint)
Пример #51
0
# END HEADER

from __future__ import division, print_function, absolute_import

import sys
import unicodedata

import hypothesis.internal.distributions as dist
from hypothesis.internal.compat import hrange, hunichr, text_type, \
    binary_type
from hypothesis.searchstrategy.strategies import check_length, \
    SearchStrategy, check_data_type, MappedSearchStrategy

# Every code point in range(sys.maxunicode) that unicodedata classifies as
# Cc (control) or Zs (space separator), in increasing order.
_spaces = list(
    codepoint
    for codepoint in range(sys.maxunicode)
    if unicodedata.category(hunichr(codepoint)) in (u'Cc', u'Zs')
)


class OneCharStringStrategy(SearchStrategy):

    """A strategy which generates single character strings of text type."""
    # The specifier for this strategy is the text type imported from
    # hypothesis.internal.compat.
    specifier = text_type
    # chr(i) for every i produced by hrange(128), joined into one string
    # (hrange is the compat helper; presumably a range equivalent).
    ascii_characters = u''.join(
        chr(i) for i in hrange(128)
    )
    # Code point of the digit '0'.
    zero_point = ord(u'0')

    def draw_parameter(self, random):
        # The alphabet size is one plus a draw from dist.geometric.
        alphabet_size = 1 + dist.geometric(random, 0.1)
        # Collects the characters of the alphabet.
        # NOTE(review): this excerpt stops here; the rest of the method is
        # not shown.
        alphabet = []
Пример #52
0
# END HEADER

from __future__ import division, print_function, absolute_import

import sys
import unicodedata

import hypothesis.internal.distributions as dist
from hypothesis.internal.compat import hrange, hunichr, text_type, \
    binary_type
from hypothesis.searchstrategy.strategies import SearchStrategy, \
    MappedSearchStrategy, check_length, check_data_type

# Control (Cc) and space-separator (Zs) code points found while scanning
# range(sys.maxunicode), kept in increasing order.
_spaces = list(filter(
    lambda value: unicodedata.category(hunichr(value)) in (u'Cc', u'Zs'),
    range(sys.maxunicode)
))


class OneCharStringStrategy(SearchStrategy):
    """A strategy which generates single character strings of text type."""
    # The specifier for this strategy is the text type imported from
    # hypothesis.internal.compat.
    specifier = text_type
    # chr(i) for every i produced by hrange(128), joined into one string
    # (hrange is the compat helper; presumably a range equivalent).
    ascii_characters = u''.join(chr(i) for i in hrange(128))
    # Code point of the digit '0'.
    zero_point = ord(u'0')

    def draw_parameter(self, random):
        # The alphabet size is one plus a draw from dist.geometric.
        alphabet_size = 1 + dist.geometric(random, 0.1)
        # Collects the characters of the alphabet.
        alphabet = []
        # ascii_chance is a bucket count between 1 and buckets, inclusive.
        buckets = 10
        ascii_chance = random.randint(1, buckets)
        # NOTE(review): this excerpt stops on the condition below; the body
        # of the branch and the rest of the method are not shown.
        if ascii_chance < buckets:
Пример #53
0
def test_exclude_only_excludes_from_that_category(cat, i):
    """Check that excluding one category leaves the other categories alone.

    The case is discarded through ``assume`` when code point ``i`` is itself
    in ``cat``; otherwise ``i`` must fall inside one of the intervals that
    ``cm.query`` returns with ``cat`` excluded.
    """
    category_of_i = unicodedata.category(hunichr(i))
    assume(category_of_i != cat)
    kept_intervals = cm.query(exclude_categories=(cat,))
    assert any(low <= i <= high for low, high in kept_intervals)