def checkAndCleanText(self, inputText, rawData):
    """Check and clean article text.

    The supplied text is kept unless it is missing (None), contains one of
    the strings in ``self.invalidTextStrings``, or is shorter than
    ``self.minArticleLengthInChars``.  In any of those cases the text is
    re-extracted from ``rawData`` with ``self.extractArticleBody``.

    The selected text is then passed through ``filterRepeatedchars`` for
    spaces, tabs, newlines, hyphens, underscores and dots, its newlines are
    replaced by spaces, and every entry of ``self.subStringsToFilter`` is
    replaced by a single space.

    :param inputText: article text already extracted by the caller; may be
        None, in which case the fallback extraction is used.
    :param rawData: raw data handed to ``self.extractArticleBody`` when the
        fallback extraction is needed.
    :return: the cleaned text.  If any step raises, the error is logged and
        the text is returned in whatever state it had reached at that point
        (possibly uncleaned, or None).
    """
    cleanedText = inputText
    try:
        # Missing text can never be valid; flagging it here routes it to the
        # fallback extractor instead of failing on the substring/length checks.
        invalidFlag = cleanedText is None
        if not invalidFlag:
            # No early exit: every matching invalid string is logged.
            for badString in self.invalidTextStrings:
                if badString in cleanedText:
                    logger.debug(
                        "%s: Found invalid text strings in data extracted: %s",
                        self.pluginName,
                        badString)
                    invalidFlag = True
        # check if article content is not valid or is too little
        # (the length is only evaluated when the text is not already flagged)
        if invalidFlag or len(cleanedText) < self.minArticleLengthInChars:
            cleanedText = self.extractArticleBody(rawData)
        # replace repeated spaces, tabs, hyphens, '\n', '\r\n', etc.
        cleanedText = filterRepeatedchars(
            cleanedText,
            deDupeList([' ', '\t', '\n', '\r\n', '-', '_', '.']))
        cleanedText = cleanedText.replace('\n', ' ')
        # remove invalid substrings:
        for stringToFilter in deDupeList(self.subStringsToFilter):
            cleanedText = cleanedText.replace(stringToFilter, " ")
    except Exception as e:
        # Best-effort cleaning: log the problem and fall through to return
        # whatever text is available rather than propagating the error.
        logger.error("Error cleaning text: %s", e)
    return cleanedText
def test_filterRepeatedchars():
    """Check that scraper_utils.filterRepeatedchars() filters repeated characters.

    Only the characters passed in the list (space, tab, newline, hyphen) are
    expected to be reduced to a single occurrence; the run of dots is expected
    to come back unchanged because '.' is not in the list.
    """
    # Only the second folder returned by getAppFolders() is needed here; it is
    # appended to sys.path, presumably so the import below can be resolved.
    (_, sourceFolder, _) = getAppFolders()
    sys.path.append(sourceFolder)
    import scraper_utils

    charList = [' ', '\t', '\n', '-']
    baseText = 'A good sentence with repeated spaces and tabs \t\t\t and\n\n\n newlines and hyphens---- dots....'
    expectedText = "A good sentence with repeated spaces and tabs \t and\n newlines and hyphens- dots...."

    resultText = scraper_utils.filterRepeatedchars(baseText, charList)
    print('Result after filtering repeated characters:\n', resultText)
    assert resultText == expectedText, \
        "10. filterRepeatedchars() is not filtering repeated characters correctly."
def checkAndCleanText(self, inputText, rawData):
    """Check and clean article text.

    The text is always re-extracted from ``rawData`` with
    ``self.extractArticleBody``; ``inputText`` is only what gets returned if
    that extraction itself raises.

    :param inputText: text already extracted by the caller.
    :param rawData: raw data handed to ``self.extractArticleBody``.
    :return: None when the extracted text contains any entry of
        ``self.invalidTextStrings``; otherwise the text after
        ``filterRepeatedchars`` and after each entry of
        ``self.subStringsToFilter`` has been replaced by a single space.  If
        a step raises, the error is logged and the text is returned in the
        state it had reached.
    """
    articleText = inputText
    try:
        # ignore the newspaper extracted text, the alternate method text is more accurate:
        articleText = self.extractArticleBody(rawData)

        # reject the article outright as soon as one invalid marker is present
        for marker in self.invalidTextStrings:
            if articleText.find(marker) < 0:
                continue
            logger.debug("%s: Found invalid text strings in data extracted: %s",
                         self.pluginName,
                         marker)
            return None

        # replace repeated spaces, tabs, hyphens, '\n', '\r\n', etc.
        articleText = filterRepeatedchars(
            articleText,
            deDupeList([' ', '\t', '\n', '\r\n', '-', '_', '.']))

        # blank out every unwanted substring
        for unwanted in deDupeList(self.subStringsToFilter):
            articleText = articleText.replace(unwanted, " ")
    except Exception as err:
        logger.error("Error cleaning text: %s", err)
    return articleText