예제 #1
0
def make_label_sortable(label, roman=False):
    """Convert a label into a tuple that sorts in a natural order.

    With ``roman`` set, the label is looked up among the first 50 values
    produced by ``roman_nums()`` and its 1-based position is returned as a
    one-element tuple; ``list.index`` raises ``ValueError`` when the label
    is not one of those values.

    Otherwise the label is split with the module-level ``_component_re``
    pattern and every all-digit piece is turned into an int; for example,
    "45Ai33b" is documented to become (45, "A", "i", 33, "b"). Appendix
    labels that look like 30(a) are meant to be made sortable the same
    way."""
    if not roman:
        pieces = _component_re.findall(label)
        return tuple(
            int(piece) if piece.isdigit() else piece for piece in pieces)

    numerals = list(itertools.islice(roman_nums(), 0, 50))
    position = numerals.index(label)
    return (position + 1,)
    def test_roman_nums(self):
        """Spot-check utils.roman_nums(): the first five values, plus a few
        larger numerals looked up by the number they represent."""
        numerals = list(itertools.islice(utils.roman_nums(), 0, 3999))
        self.assertEqual(['i', 'ii', 'iii', 'iv', 'v'], numerals[:5])

        # (expected numeral, value it stands for) -- the value is spelled
        # out additively so it mirrors the numeral's letters
        cases = [
            ('xvii', 10 + 5 + 1 + 1),
            ('xlv', (50-10) + 5),
            ('dclv', 500 + 100 + 50 + 5),
            ('mcmxcvi', 1000 + (1000-100) + (100-10) + 5 + 1),
        ]
        for numeral, value in cases:
            # the sequence starts at 1, so value n sits at index n - 1
            self.assertEqual(numeral, numerals[value - 1])
예제 #3
0
    def test_roman_nums(self):
        """Check the start of utils.roman_nums() and a handful of larger
        numerals, each addressed by the number it represents."""
        sequence = list(itertools.islice(utils.roman_nums(), 0, 3999))
        self.assertEqual(['i', 'ii', 'iii', 'iv', 'v'], sequence[:5])

        def numeral_for(value):
            """Numeral generated for `value` (the sequence is 1-based)."""
            return sequence[value - 1]

        self.assertEqual('xvii', numeral_for(10 + 5 + 1 + 1))
        self.assertEqual('xlv', numeral_for((50 - 10) + 5))
        self.assertEqual('dclv', numeral_for(500 + 100 + 50 + 5))
        self.assertEqual(
            'mcmxcvi',
            numeral_for(1000 + (1000 - 100) + (100 - 10) + 5 + 1))
예제 #4
0
def make_label_sortable(label, roman=False):
    """Turn a label into a tuple of ints and strings that sorts naturally.

    A label made only of digits becomes a one-element tuple holding its
    int value. With ``roman`` set, a label found among the first 50 values
    of ``roman_nums()`` becomes a one-element tuple holding its 1-based
    position. Every other label is cut into runs of digits, uppercase
    letters and lowercase letters; digit runs become ints. Any other
    character (e.g. the parens in an appendix label such as 30(a)) only
    separates runs and is dropped, so 30(a) becomes (30, 'a')."""
    if label.isdigit():
        return (int(label),)

    if roman:
        numerals = list(itertools.islice(roman_nums(), 0, 50))
        if label in numerals:
            return (numerals.index(label) + 1,)

    INT, UPPER, LOWER = 1, 2, 3

    def kind_of(ch):
        """Classify one character; None marks a separator."""
        if ch.isdigit():
            return INT
        if ch.isalpha():
            if ch == ch.upper():
                return UPPER
            if ch == ch.lower():
                return LOWER
        return None

    # Neighbouring characters of the same kind form one piece,
    # e.g. 45Ai33b becomes (45, 'A', 'i', 33, 'b')
    pieces = []
    for kind, run in itertools.groupby(label, key=kind_of):
        if kind is None:    # separators never make it into the result
            continue
        text = "".join(run)
        pieces.append(int(text) if text.isdigit() else text)
    return tuple(pieces)
예제 #5
0
def make_label_sortable(label, roman=False):
    """Build a sort key (a tuple of ints and strings) from a label.

    An all-digit label yields a one-element tuple with its int value. When
    ``roman`` is set and the label is one of the first 50 values of
    ``roman_nums()``, a one-element tuple with its 1-based position is
    returned. Anything else is segmented character by character, e.g.
    45Ai33b becomes (45, 'A', 'i', 33, 'b'); appendix labels that look
    like 30(a) lose their parens and become (30, 'a')."""
    if label.isdigit():
        return (int(label), )

    if roman:
        numerals = list(itertools.islice(roman_nums(), 0, 50))
        try:
            return (numerals.index(label) + 1, )
        except ValueError:
            pass  # not a known numeral; segment it like any other label

    DIGITS, CAPITALS, SMALLS = 1, 2, 3
    parts = []
    current, current_kind = "", None
    for char in label:
        # anything unclassified (e.g. parens) only forces a break
        kind = None
        if char.isdigit():
            kind = DIGITS
        elif char.isalpha():
            if char == char.upper():
                kind = CAPITALS
            elif char == char.lower():
                kind = SMALLS

        # a change of kind closes the piece collected so far
        if current and kind != current_kind:
            parts.append(current)
            current = ""

        current_kind = kind
        if kind is not None:
            current += char

    if current:  # the label did not end on a separator
        parts.append(current)

    return tuple(int(part) if part.isdigit() else part for part in parts)
import itertools
import re
import string
import sys

from regparser.tree import struct
from regparser.search import segments
from regparser.utils import roman_nums

# Candidate paragraph markers, one list per entry
p_levels = [
    # a, b, ..., z
    list(string.ascii_lowercase),
    # '1' through '50'
    list(map(str, range(1, 51))),
    # the first 50 values yielded by roman_nums()
    list(itertools.islice(roman_nums(), 50)),
    # A, B, ..., Z
    list(string.ascii_uppercase),
    # lowercase letters wrapped in <E> tags; technically there are
    # italic roman numerals too, but those are not handled yet
    ['<E>' + letter + '</E>' for letter in string.ascii_lowercase],
]


class ParagraphParser():

    def __init__(self, p_regex, node_type):
        """Remember the settings used when searching through paragraphs.

        p_regex is the regular expression used for the search; it should
        contain a %s for the next paragraph 'part' (e.g. 'a', 'A', '1',
        'i', etc.). node_type is stored on the instance unchanged.
        NOTE(review): earlier documentation also described an
        inner_label_fn callback, but this constructor takes no such
        argument."""
        self.node_type = node_type
        self.p_regex = p_regex
예제 #7
0
    return u'<E T="03">{}</E>'.format(marker_plain)


def deemphasize(marker):
    """Strip emphasis markup from a marker, leaving only its plain text.

    Knowing about emphasis helps when determining depth, but it gets in
    the way wherever only the text matters, so every '<E T="03">' opening
    tag and every '</E>' closing tag is removed."""
    plain = marker
    for tag in ('<E T="03">', '</E>'):
        plain = plain.replace(tag, '')
    return plain


# Single letters followed by doubled letters (aa, bb, ...). 'ii' is left
# out of the lowercase run -- presumably because it is also a roman
# numeral; confirm before relying on that.
lower = tuple(string.ascii_lowercase) + tuple(
    letter * 2 for letter in string.ascii_lowercase if letter != 'i')
upper = tuple(string.ascii_uppercase) + tuple(
    letter * 2 for letter in string.ascii_uppercase)
# '1' through '998'
ints = tuple(map(str, range(1, 999)))
# the first 50 values yielded by roman_nums()
roman = tuple(itertools.islice(roman_nums(), 50))
upper_roman = tuple(numeral.upper() for numeral in roman)
# the same integer and roman markers passed through emphasize()
em_ints = tuple(map(emphasize, ints))
em_roman = tuple(map(emphasize, roman))


# The two kinds of stars are kept apart because the kind indicates how much
# space they can occupy
STARS_TAG = 'STARS'
INLINE_STARS = '* * *'
stars = (STARS_TAG, INLINE_STARS)

# Stands in for paragraphs that carry no marker at all
MARKERLESS = 'MARKERLESS'
markerless = (MARKERLESS,)
예제 #8
0
import itertools
import re
import string
import sys

from regparser.tree import struct
from regparser.search import segments
from regparser.utils import roman_nums

# Candidate paragraph markers, one list per entry
p_levels = [
    # a, b, ..., z
    list(string.ascii_lowercase),
    # '1' through '50'
    list(map(str, range(1, 51))),
    # the first 50 values yielded by roman_nums()
    list(itertools.islice(roman_nums(), 50)),
    # A, B, ..., Z
    list(string.ascii_uppercase),
    # '1' through '50' wrapped in <E T="03"> tags
    ['<E T="03">' + str(number) + '</E>' for number in range(1, 51)],
    # the same 50 roman_nums() values wrapped in <E T="03"> tags
    ['<E T="03">' + numeral + '</E>'
     for numeral in itertools.islice(roman_nums(), 50)],
]


def p_level_of(marker):
    """Return the index of every p_levels entry that contains `marker`.

    A marker (string) can belong to more than one paragraph level, so all
    candidate levels are returned, in ascending order. This is useful for
    determining the order of paragraphs."""
    return [depth for depth, candidates in enumerate(p_levels)
            if marker in candidates]

예제 #9
0
    return u'<E T="03">{}</E>'.format(marker_plain)


def deemphasize(marker):
    """Return `marker` with its emphasis tags removed.

    Emphasis is useful when working out depth, but elsewhere only the
    plain text matters; both the '<E T="03">' opening tags and the '</E>'
    closing tags are dropped."""
    opening, closing = '<E T="03">', '</E>'
    without_opening = marker.replace(opening, '')
    return without_opening.replace(closing, '')


# Single letters, then doubled letters (aa, bb, ...). The lowercase run
# skips 'ii' -- presumably because it doubles as a roman numeral; confirm
# before relying on that.
lower = tuple(string.ascii_lowercase) + tuple(
    2 * letter for letter in string.ascii_lowercase if letter != 'i')
upper = tuple(string.ascii_uppercase) + tuple(
    2 * letter for letter in string.ascii_uppercase)
# '1' through '998'
ints = tuple(map(str, range(1, 999)))
# the first 50 values yielded by roman_nums()
roman = tuple(itertools.islice(roman_nums(), 50))
upper_roman = tuple(numeral.upper() for numeral in roman)
# the integer and roman markers again, each passed through emphasize()
em_ints = tuple(map(emphasize, ints))
em_roman = tuple(map(emphasize, roman))

# Star types are distinguished because the type indicates how much space
# they can occupy
STARS_TAG = 'STARS'
INLINE_STARS = '* * *'
stars = (STARS_TAG, INLINE_STARS)

# Used for paragraphs that have no marker at all
MARKERLESS = 'MARKERLESS'
markerless = (MARKERLESS, )

types = [
예제 #10
0
import itertools
import re
import string

from regparser.tree import struct
from regparser.search import segments
from regparser.utils import roman_nums

# Candidate paragraph markers, one list per entry: lowercase letters,
# '1'-'50', the first 50 roman_nums() values, uppercase letters, and then
# the integers and roman values again wrapped in <E T="03"> tags
p_levels = [
    list(string.ascii_lowercase),
    list(map(str, range(1, 51))),
    list(itertools.islice(roman_nums(), 50)),
    list(string.ascii_uppercase),
    ['<E T="03">' + str(number) + '</E>' for number in range(1, 51)],
    ['<E T="03">' + numeral + '</E>'
     for numeral in itertools.islice(roman_nums(), 50)],
]


def p_level_of(marker):
    """List the paragraph levels a marker (string) could fall into.

    Each entry of p_levels is checked in turn and the index of every entry
    containing the marker is collected. This is useful for determining the
    order of paragraphs."""
    found = []
    for depth, candidates in enumerate(p_levels):
        if marker not in candidates:
            continue
        found.append(depth)
    return found


class ParagraphParser():
    def __init__(self, p_regex, node_type):
        """p_regex is the regular expression used when searching through