def test_collapse_tabs(self):
    """Check web.collapse_tabs() with default arguments and with indentation=True."""
    # Assert collapse multiple tabs to 1 space.
    cases = (
        ("\t\t\t", ""),
        ("\t..\t", ".."),
        (".\t\t.", ". ."),
        (".\t\n", "."),
    )
    for a, b in cases:
        self.assertEqual(web.collapse_tabs(a), b)
    # Assert preserve indentation.
    self.assertEqual(web.collapse_tabs("\t\t .\t\n", indentation=True), "\t\t .")
    # The parenthesized single-argument form prints the same text on
    # Python 2 and Python 3 (the bare print statement is Python 2 only).
    print("pattern.web.collapse_tabs()")
def cleanup(text, remove_punctuation=True):
    """Normalize *text* into a single stripped line.

    When ``remove_punctuation`` is true, every run of characters other than
    ASCII letters, ASCII digits and whitespace is deleted first (this also
    removes underscores and non-ASCII letters, not just punctuation).

    The text is then passed through ``collapse_linebreaks`` (threshold=1),
    remaining newlines are replaced by spaces, and ``collapse_tabs`` and
    ``collapse_spaces`` are applied with ``indentation=False`` and
    ``replace=' '``. Leading and trailing whitespace is stripped from the
    result.

    NOTE(review): the exact behavior of the ``collapse_*`` helpers is defined
    elsewhere; presumably they squeeze repeated separators -- confirm there.
    """
    if remove_punctuation:
        # Raw string: '\s' in a normal literal is an invalid escape sequence.
        # '\s' already matches '\n', so the class is equivalent to the
        # previous '[^A-Za-z0-9\s\n]+' pattern.
        text = re.sub(r'[^A-Za-z0-9\s]+', '', text)
    text = collapse_linebreaks(text, threshold=1).replace('\n', ' ')
    text = collapse_tabs(text, indentation=False, replace=' ')
    text = collapse_spaces(text, indentation=False, replace=' ')
    return text.strip()