def linesfilter(postName, commentName, postfix, maxlen=200):
    """Filter a parallel post/comment file pair and write the cleaned copies.

    Both files are read in full and must contain the same number of lines.
    The raw lines are converted with ``_getTextList``; every pair in which
    either text is longer than ``maxlen`` is reported on stdout and skipped.
    The remaining texts are passed through ``Filter.urlFilter`` and then
    ``Filter.spaceFilter`` and written, one per line, to
    ``postName + postfix`` and ``commentName + postfix``.

    Args:
        postName: Path of the post file to read.
        commentName: Path of the comment file to read.
        postfix: Suffix appended to each input path to form the output path.
        maxlen: Maximum allowed ``len()`` of a post or comment text; pairs
            exceeding it are dropped. Defaults to 200.

    Raises:
        AssertionError: If the two input files differ in line count.
    """
    # Context managers close the input handles even if reading fails.
    with open(postName) as fpost, open(commentName) as fcomment:
        postLine = fpost.readlines()
        commentLine = fcomment.readlines()

    # The files are aligned line by line, so a count mismatch means the
    # pairing is broken.
    assert len(postLine) == len(commentLine)

    postTextLines = _getTextList(postLine)
    commentTextLines = _getTextList(commentLine)

    def _clean(text):
        # Same pipeline for posts and comments: URL filter, then space filter.
        return Filter.spaceFilter(Filter.urlFilter(text))

    postLineWrite = []
    commentLineWrite = []
    for i, postText in enumerate(postTextLines):
        # Drop the whole pair when either side is too long, so the two
        # output files stay aligned.
        if len(postText) > maxlen or len(commentTextLines[i]) > maxlen:
            # Single-argument print(...) emits the same text as the old
            # Python 2 print statements and also parses under Python 3.
            print('%s %s' % (postText, len(postText)))
            print('%s %s' % ('too long ', i))
            continue

        postLineWrite.append(_clean(postText) + '\n')
        commentLineWrite.append(_clean(commentTextLines[i]) + '\n')

    filteredPostName = postName + postfix
    filteredCommentName = commentName + postfix
    print(filteredPostName)

    # Closing the outputs explicitly guarantees the data is flushed to disk
    # before the function returns.
    with open(filteredPostName, 'w') as fpost, \
            open(filteredCommentName, 'w') as fcomment:
        fpost.writelines(postLineWrite)
        fcomment.writelines(commentLineWrite)
def testSpaceFilter(contents):
    """Run ``Filter.spaceFilter`` on ``contents`` and return its result.

    Args:
        contents: The value handed to ``Filter.spaceFilter``.

    Returns:
        Whatever ``Filter.spaceFilter`` returns for ``contents``.
    """
    # The original passed the undefined name ``text`` here, ignoring the
    # ``contents`` parameter entirely.
    filted = Filter.spaceFilter(contents)
    return filted