Exemplo n.º 1
0
def normalize_fulltext(fulltext):
    """Clean up text extracted by pdftotext and return the result.

    The text is padded with one space on each side, run through
    ``replace_undesirable_characters`` and ``_replace_greek_characters``,
    and finally rewritten by every ``(regex, replacement)`` pair returned by
    ``get_washing_regex()``, applied in the order given.
    """
    # Keywords are delimited by spaces, so pad both ends to let the first
    # and last words of the document match as well.
    padded = " " + fulltext + " "

    # Character-level clean-up first: odd unicode characters, then greek
    # letters (replaced by their names).
    text = _replace_greek_characters(replace_undesirable_characters(padded))

    # Run every washing substitution over the text, one after the other.
    for pattern, substitute in get_washing_regex():
        text = pattern.sub(substitute, text)

    return text
def normalize_fulltext(fulltext):
    """Return the pdftotext output after the standard clean-up passes.

    Steps, in order: pad the text with a space at both ends, call
    ``replace_undesirable_characters``, call ``_replace_greek_characters``,
    then apply each ``(regex, replacement)`` pair from
    ``get_washing_regex()`` sequentially.
    """
    # The surrounding spaces allow space-delimited keyword matching to work
    # for the very first and very last word of the document.
    cleaned = " " + fulltext + " "

    # Replace some weird unicode characters.
    cleaned = replace_undesirable_characters(cleaned)
    # Replace the greek characters by their name.
    cleaned = _replace_greek_characters(cleaned)

    # Each rule sees the output of the previous one, so order matters.
    rules = get_washing_regex()
    for rule in rules:
        compiled, substitution = rule
        cleaned = compiled.sub(substitution, cleaned)

    return cleaned
def normalize_fulltext(fulltext):
    """Return a 'cleaned' version of the output provided by pdftotext.

    The text is padded with one space on each side, passed through
    ``replace_undesirable_characters`` and ``_replace_greek_characters``,
    and then rewritten by an ordered list of regular-expression
    substitutions that mostly glue particle / group names to the symbol or
    number that follows them (e.g. ``"SU ( 2 )"`` -> ``"SU(2)"``).

    :param fulltext: the text to clean.
    :return: the cleaned text. No final strip is performed, so the padding
        added at both ends is still subject only to the space-collapsing
        rules.
    """
    # We recognize keywords by the spaces. We need these to match the
    # first and last words of the document.
    fulltext = " " + fulltext + " "

    # Replace some weird unicode characters.
    fulltext = replace_undesirable_characters(fulltext)
    # Replace the greek characters by their name.
    fulltext = _replace_greek_characters(fulltext)

    washing_regex = [
        # Join "non"/"anti" with the following word by dropping the hyphen
        # or space in between (e.g. "non-abelian" / "non abelian" ->
        # "nonabelian"). This allows a better detection of such keywords.
        (re.compile(r"(\snon)[- ](\w+)"), r"\1\2"),
        (re.compile(r"(\santi)[- ](\w+)"), r"\1\2"),
        # Remove all leading numbers (e.g. 2-pion -> pion).
        (re.compile(r"\s\d-"), " "),
        # Remove multiple spaces.
        (re.compile(r" +"), " "),
    ]

    # Remove spaces in particle names.
    # Particles with -/+/*
    washing_regex += [
        (re.compile(r"(%s) ([-+*])" % name), r"\1\2")
        for name in ("c", "muon", "s", "B", "D", "K", "Lambda", "Mu", "Omega", "Pi", "Sigma", "Tau", "W", "Xi")
    ]

    # Particles followed by numbers
    washing_regex += [
        (re.compile(r"(%s) ([0-9]\W)" % name), r"\1\2")
        for name in (
            "a",
            "b",
            "c",
            "f",
            "h",
            "s",
            "B",
            "D",
            "H",
            "K",
            "L",
            "Phi",
            "Pi",
            "Psi",
            "Rho",
            "Stor",
            "UA",
            "Xi",
            "Z",
        )
    ]

    # Names followed by a number in parentheses, e.g. "SU ( 2 )" -> "SU(2)".
    # A single capital letter directly after the closing parenthesis is
    # dropped as well.
    washing_regex += [
        (re.compile(r"(\W%s) ?\( ?([0-9]+) ?\)[A-Z]?" % name), r"\1(\2)")
        for name in ("CP", "E", "G", "O", "S", "SL", "SO", "Spin", "SU", "U", "W", "Z")
    ]

    # Particles with '
    washing_regex += [(re.compile(r"(\W%s) ('\W)" % name), r"\1\2") for name in ("Eta", "W", "Z")]

    # Particles with (N)
    washing_regex += [
        (re.compile(r"(\W%s) ?\( ?N ?\)[A-Z]?" % name), r"\1(N)")
        for name in ("CP", "GL", "O", "SL", "SO", "Sp", "Spin", "SU", "U", "W", "Z")
    ]

    # All names followed by ([0-9]{3,4})
    washing_regex.append((re.compile(r"([A-Za-z]) (\([0-9]{3,4}\)\+?)\s"), r"\1\2 "))

    # Some weird names followed by ([0-9]{3,4}). These end in a digit or
    # '*', so the generic rule above (which requires a trailing letter)
    # does not catch them.
    # The name must be a real capturing group: the replacement refers to
    # groups 1 and 2, and Python 3 rejects a template that references a
    # non-existent group as soon as sub() is called, match or no match.
    # The names are raw strings because they are regex fragments ("K\*").
    washing_regex += [
        (re.compile(r"(%s) (\([0-9]{3,4}\))" % name), r"\1\2 ")
        for name in (r"a0", r"Ds1", r"Ds2", r"K\*")
    ]

    washing_regex += [
        # Remove all lonely operators (usually these are errors
        # introduced by pdftotext.)
        (re.compile(r" [+*] "), r" "),
        # Remove multiple spaces.
        (re.compile(r" +"), " "),
        # Remove multiple line breaks.
        (re.compile(r"\n+"), r"\n"),
    ]

    # Apply the regular expressions to the fulltext, in order: later rules
    # operate on the output of earlier ones.
    for regex, replacement in washing_regex:
        fulltext = regex.sub(replacement, fulltext)

    return fulltext