def collapsed_markers_matches(node_text, tagged_text):
    """Locate collapsed markers in `node_text`, i.e. markers for tree node
    paragraphs which start part-way through a single XML node. Citations
    and other false positives are discarded. The search runs over the plain
    text and only consults the tagged text for hints.

    Returns a list of the match objects produced by the `_first_markers`
    patterns. @todo: streamline logic"""
    # A keyterm is an acceptable prefix for a marker. Overwrite it with the
    # same number of periods (offsets are unchanged) -- presumably so that
    # the marker patterns accept it as a prefix
    keyterm_node = Node(node_text, node_type=Node.INTERP,
                        label=[get_first_interp_marker(node_text)])
    keyterm_node.tagged_text = tagged_text
    keyterm = KeyTerms.get_keyterm(keyterm_node)
    if keyterm:
        node_text = node_text.replace(keyterm, '.'*len(keyterm))

    # Text which, found directly after a match, rules that match out
    disqualifying = ("e.", ")", u"”", '"', "'")

    found = []
    for marker in _first_markers:
        # Matches at offset zero are skipped; only inner markers count
        candidates = ((match, match.start(), match.end())
                      for match in marker.finditer(node_text)
                      if match.start() > 0)
        candidates = remove_citation_overlaps(node_text, candidates)
        kept = [match for match, _, end in candidates
                if not node_text[end:].startswith(disqualifying)]
        # Every collapsed "1." marker must be emphasized; when no such
        # emphasis appears in the tagged text, drop all "1" matches
        if '<E T="03">1' not in tagged_text:
            kept = [match for match in kept if match.group(1) != '1']
        found.extend(kept)
    return found
Пример #2
0
def collapsed_markers_matches(node_text, tagged_text):
    """Collect the collapsed markers inside `node_text` -- markers of tree
    node paragraphs that begin in the middle of a single XML node. Matches
    overlapping citations, plus a few other false positives, are removed.
    Works on the plain text while taking cues from the tagged text.

    Returns a list of match objects. @todo: streamline logic"""
    # Keyterms may come right before a marker, so mask the keyterm with
    # periods of equal length before running the marker patterns
    masking_node = Node(node_text, node_type=Node.INTERP,
                        label=[get_first_interp_marker(node_text)])
    masking_node.tagged_text = tagged_text
    keyterm = KeyTerms.get_keyterm(masking_node)
    if keyterm:
        node_text = node_text.replace(keyterm, '.'*len(keyterm))

    bad_followers = ("e.", ")", u"”", '"', "'")
    results = []
    for marker in _first_markers:
        # Only consider hits which do not open the text
        triplets = remove_citation_overlaps(
            node_text,
            ((hit, hit.start(), hit.end())
             for hit in marker.finditer(node_text) if hit.start() > 0))
        # A hit immediately followed by one of these strings is rejected
        hits = [hit for hit, _, stop in triplets
                if not node_text[stop:].startswith(bad_followers)]
        # "1." collapsed markers are always emphasized, so without that
        # emphasis in the tagged text any "1" hit is a false positive
        if '<E T="03">1' not in tagged_text:
            hits = [hit for hit in hits if hit.group(1) != '1']
        results.extend(hits)
    return results
Пример #3
0
def collapsed_markers_matches(node_text, tagged_text):
    """Locate collapsed markers in `node_text`, i.e. markers for tree node
    paragraphs which start part-way through a single XML node. Citations
    and other false positives are discarded. The search runs over the plain
    text and only consults the tagged text for hints.

    Returns a list of match objects. @todo: streamline logic"""
    # A keyterm is an acceptable prefix for a marker; overwrite it with
    # periods of the same length so that text offsets are preserved
    keyterm_node = Node(
        node_text, node_type=Node.INTERP, tagged_text=tagged_text,
        label=[get_first_interp_marker(node_text)]
    )
    keyterm = KeyTerms.keyterm_in_node(keyterm_node)
    if keyterm:
        node_text = node_text.replace(keyterm, '.' * len(keyterm))

    found = []
    for marker in _first_markers:
        spans = [(match, match.start(), match.end())
                 for match in marker.finditer(node_text)]
        # Drop citation overlaps first, then screen each remaining match
        survivors = list(remove_citation_overlaps(node_text, spans))
        for entry in survivors:
            match = entry[0]
            if not false_collapsed_marker(match, node_text, tagged_text):
                found.append(match)
    return found
Пример #4
0
def collapsed_markers(text):
    """Paragraph markers do not always sit at the beginning of the text.
    Return the inner ones, such as the (1) and (i) in:
    (c) cContent —(1) 1Content (i) iContent"""
    scanned = list(_collapsed_grammar.scanString(text))
    #   discard results which overlap with citations
    scanned = remove_citation_overlaps(text, scanned)
    #   only the first marker of each level -- (a), (1), (i), etc. -- can
    #   be collapsed
    collapsible = [level[0] for level in p_levels]

    markers = []
    for tokens, _, _ in scanned:
        for token in tokens:
            if token in collapsible:
                markers.append(token)
    return markers
Пример #5
0
def collapsed_markers_matches(node_text):
    """Locate collapsed markers in `node_text`, i.e. markers for tree node
    paragraphs which start part-way through a single XML node. Citations
    and other false positives are discarded.

    Returns a list of match objects."""
    # Text which, found directly after a match, rules that match out
    disqualifying = ("e.", ")", u"”", '"', "'")

    found = []
    for marker in _first_markers:
        # Matches at offset zero are skipped; only inner markers count
        candidates = ((match, match.start(), match.end())
                      for match in marker.finditer(node_text)
                      if match.start() > 0)
        for match, _, end in remove_citation_overlaps(node_text, candidates):
            if not node_text[end:].startswith(disqualifying):
                found.append(match)
    return found
Пример #6
0
def collapsed_markers(text):
    """Some paragraph markers appear inside the text rather than at its
    start. Return those inner markers, e.g. the (1) and (i) in:
    (c) cContent —(1) 1Content (i) iContent"""
    #   markers which open a level -- (a), (1), (i), etc. -- are the only
    #   ones that can be collapsed
    level_openers = [level[0] for level in p_levels]

    hits = list(_collapsed_grammar.scanString(text))
    inner = []
    #   skip anything overlapping a citation, then flatten and filter
    for parsed, _, _ in remove_citation_overlaps(text, hits):
        inner.extend(pm for pm in parsed if pm in level_openers)
    return inner
Пример #7
0
def get_collapsed_markers(text):
    """Not all paragraph markers are at the beginning of the text. This
    grabs inner markers like (1) and (i) here:
    (c) cContent —(1) 1Content (i) iContent

    Each parser in `_first_markers` is scanned over `text`; results are
    (tokens, start, end) triplets. Returns a flat list of the tokens from
    every match that neither starts the text nor overlaps a citation."""
    matches = []
    for parser in _first_markers:
        #   remove matches at the beginning. Matches are gathered parser by
        #   parser, so a leading match may come from any parser; test each
        #   one rather than only the first entry of the combined list
        matches.extend(match for match in parser.scanString(text)
                       if match[1] != 0)

    #   remove any that overlap with citations
    matches = remove_citation_overlaps(text, matches)

    #   get the letters, flattening the token groups into one list
    return [token for tokens, _, _ in matches for token in tokens]
def get_collapsed_markers(text):
    """Not all paragraph markers are at the beginning of the text. This
    grabs inner markers like (1) and (i) here:
    (c) cContent —(1) 1Content (i) iContent

    Scans `text` with every parser in `_first_markers`, whose results are
    (tokens, start, end) triplets, and returns one flat list holding the
    tokens of each match that is not at offset zero and does not overlap a
    citation."""
    inner_matches = []
    for parser in _first_markers:
        for match in parser.scanString(text):
            #   skip matches at the beginning. Checking only the first
            #   collected match would miss a leading marker found by a
            #   later parser, so every match is checked
            if match[1] != 0:
                inner_matches.append(match)

    #   remove any that overlap with citations
    inner_matches = remove_citation_overlaps(text, inner_matches)

    #   get the letters, flattening the token groups into one list
    return [token for tokens, _, _ in inner_matches for token in tokens]