def normalize_fulltext(fulltext):
    """Return a 'cleaned' version of the text produced by pdftotext.

    The text is padded with one space on each side, passed through
    ``replace_undesirable_characters`` and ``_replace_greek_characters``,
    and finally rewritten by every ``(regex, replacement)`` pair returned
    by ``get_washing_regex()``, applied in order.
    """
    # Keywords are recognized by their surrounding spaces, so pad the text
    # to let the first and last words of the document match as well.
    cleaned = _replace_greek_characters(
        replace_undesirable_characters(" " + fulltext + " ")
    )

    # Each substitution works on the result of the previous one.
    for pattern, substitute in get_washing_regex():
        cleaned = pattern.sub(substitute, cleaned)
    return cleaned
def normalize_fulltext(fulltext):
    """Clean up the raw text produced by pdftotext and return it.

    Steps, in order: pad the text with a space on both ends, replace
    undesirable unicode characters, spell out greek characters by name,
    then run every washing regular expression over the result.
    """
    # Padding lets space-delimited keyword matching also catch the very
    # first and the very last word of the document.
    padded = " " + fulltext + " "

    without_odd_chars = replace_undesirable_characters(padded)
    text = _replace_greek_characters(without_odd_chars)

    # Apply the (compiled regex, replacement) pairs sequentially.
    rules = get_washing_regex()
    for rule in rules:
        compiled, repl = rule
        text = compiled.sub(repl, text)

    return text
def normalize_fulltext(fulltext):
    """Return a 'cleaned' version of the output provided by pdftotext.

    The text is padded with a space on each side, passed through
    ``replace_undesirable_characters`` and ``_replace_greek_characters``,
    and then rewritten by a fixed, ordered list of regular-expression
    substitutions. The order matters: later rules rely on earlier ones
    having collapsed runs of spaces.

    :param fulltext: text extracted from a document.
    :return: the normalized text.
    """
    # We recognize keywords by the spaces. We need these to match the
    # first and last words of the document.
    fulltext = " " + fulltext + " "

    # Replace some weird unicode characters.
    fulltext = replace_undesirable_characters(fulltext)
    # Replace the greek characters by their name.
    fulltext = _replace_greek_characters(fulltext)

    washing_regex = [
        # Join "non"/"anti" to the following word by dropping the hyphen or
        # space in between (e.g. "non abelian" / "non-abelian" ->
        # "nonabelian"). This allows a better detection of such keywords.
        (re.compile(r"(\snon)[- ](\w+)"), r"\1\2"),
        (re.compile(r"(\santi)[- ](\w+)"), r"\1\2"),
        # Remove all leading numbers (e.g. 2-pion -> pion).
        (re.compile(r"\s\d-"), " "),
        # Remove multiple spaces.
        (re.compile(r" +"), " "),
    ]

    # Remove spaces in particle names.
    # Particles with -/+/*
    washing_regex += [
        (re.compile(r"(%s) ([-+*])" % name), r"\1\2")
        for name in (
            "c", "muon", "s", "B", "D", "K", "Lambda", "Mu", "Omega", "Pi",
            "Sigma", "Tau", "W", "Xi",
        )
    ]

    # Particles followed by numbers
    washing_regex += [
        (re.compile(r"(%s) ([0-9]\W)" % name), r"\1\2")
        for name in (
            "a", "b", "c", "f", "h", "s", "B", "D", "H", "K", "L", "Phi",
            "Pi", "Psi", "Rho", "Stor", "UA", "Xi", "Z",
        )
    ]

    # Names followed by a parenthesized number: normalize the spacing to
    # "NAME(123)" and drop a single trailing capital letter.
    washing_regex += [
        (re.compile(r"(\W%s) ?\( ?([0-9]+) ?\)[A-Z]?" % name), r"\1(\2)")
        for name in (
            "CP", "E", "G", "O", "S", "SL", "SO", "Spin", "SU", "U", "W", "Z",
        )
    ]

    # Particles with '
    washing_regex += [
        (re.compile(r"(\W%s) ('\W)" % name), r"\1\2")
        for name in ("Eta", "W", "Z")
    ]

    # Particles with (N)
    washing_regex += [
        (re.compile(r"(\W%s) ?\( ?N ?\)[A-Z]?" % name), r"\1(N)")
        for name in (
            "CP", "GL", "O", "SL", "SO", "Sp", "Spin", "SU", "U", "W", "Z",
        )
    ]

    # All names followed by ([0-9]{3,4})
    washing_regex.append(
        (re.compile(r"([A-Za-z]) (\([0-9]{3,4}\)\+?)\s"), r"\1\2 ")
    )

    # Some weird names followed by ([0-9]{3,4}).
    # The name has to be a capturing group: the replacement refers to both
    # \1 (the name) and \2 (the parenthesized number), and a reference to a
    # group that does not exist makes ``sub`` raise ``re.error``.
    washing_regex += [
        (re.compile(r"(%s) (\([0-9]{3,4}\))" % name), r"\1\2 ")
        for name in ("a0", "Ds1", "Ds2", r"K\*")
    ]

    washing_regex += [
        # Remove all lonely operators (usually these are errors
        # introduced by pdftotext.)
        (re.compile(r" [+*] "), r" "),
        # Remove multiple spaces.
        (re.compile(r" +"), " "),
        # Remove multiple line breaks.
        (re.compile(r"\n+"), r"\n"),
    ]

    # Apply the regular expressions to the fulltext.
    for regex, replacement in washing_regex:
        fulltext = regex.sub(replacement, fulltext)

    return fulltext