def wash_and_repair_reference_line(line):
    """Wash a reference line of undesirable characters and repair it.

    The line is passed, in order, through URL repair, undesirable-character
    replacement, a fix for a comma typed inside a closing double quote,
    a second undesirable-character replacement, and finally whitespace
    collapsing.

    @param line: (string) the reference line to be washed/repaired.
    @return: (string) the washed reference line.
    """
    # Repair URLs in the line first, before any character rewriting.
    line = repair_broken_urls(line)

    # Replace various undesirable characters with their alternatives.
    line = replace_undesirable_characters(line)

    # Replace "<title>," with "<title>", -- a common typing mistake.
    # Plain u'' literals (with the backslash escaped) are used instead of
    # ur'' because the ur prefix is a syntax error on Python 3; the string
    # values are identical.
    line = re.sub(u'"([^"]+),"', u'"\\g<1>",', line)

    # The original code runs the character replacement a second time after
    # the quote fix; that pass is kept so the output stays unchanged.
    line = replace_undesirable_characters(line)

    # Collapse runs of multiple spaces into a single space.
    line = re_multiple_space.sub(u' ', line)
    return line
def wash_and_repair_reference_line(line):
    """Wash a reference line of undesirable characters and repair it.

    Steps applied, in order: broken-URL repair, undesirable-character
    replacement, moving a comma typed inside a closing double quote to
    just after it, undesirable-character replacement again, and collapsing
    of repeated spaces.

    @param line: (string) the reference line to be washed/repaired.
    @return: (string) the washed reference line.
    """
    # NOTE(review): this function name is defined more than once in this
    # file; the later definition is the one that stays bound at import.

    # Repair URLs in the line.
    line = repair_broken_urls(line)

    # Replace various undesirable characters with their alternatives.
    line = replace_undesirable_characters(line)

    # Replace "<title>," with "<title>", -- a common typing mistake.
    # u'' literals with an escaped backslash replace the ur'' literals,
    # which do not parse on Python 3; the resulting strings are the same.
    line = re.sub(u'"([^"]+),"', u'"\\g<1>",', line)

    # Second replacement pass, preserved from the original sequence.
    line = replace_undesirable_characters(line)

    # Remove instances of multiple spaces, replacing with a single space.
    line = re_multiple_space.sub(u" ", line)
    return line
def normalize_fulltext(fulltext):
    """Return a 'cleaned' version of the output provided by pdftotext.

    The text is padded with one space on each side, passed through the
    undesirable-character and greek-character replacements, and then
    rewritten by every (regex, replacement) pair from get_washing_regex().
    """
    # Keywords are recognized by their surrounding spaces, so pad the text
    # to let the first and last words of the document match as well.
    padded = " " + fulltext + " "

    # Replace some weird unicode characters, then replace the greek
    # characters by their name.
    cleaned = _replace_greek_characters(replace_undesirable_characters(padded))

    # Apply each washing regular expression in the order it is supplied.
    for pattern, substitute in get_washing_regex():
        cleaned = pattern.sub(substitute, cleaned)

    return cleaned