예제 #1
0
 def checkAndCleanText(self, inputText, rawData):
     """ Validate the given article text and tidy it up.

     The text is scanned for every entry of self.invalidTextStrings; each
     hit is logged at debug level. If any entry was found, or the text is
     shorter than self.minArticleLengthInChars, the text is replaced by
     the result of self.extractArticleBody(rawData).

     The text is then passed through filterRepeatedchars() for a fixed
     set of characters, newlines are turned into spaces, and every entry
     of self.subStringsToFilter is replaced by a single space.

     Any exception raised along the way is logged and swallowed; the text
     as it stood when the error occurred is returned.

     :param inputText: candidate article text to check and clean
     :param rawData: data handed to extractArticleBody() for the fallback
     :return: the cleaned text
     """
     text = inputText
     try:
         sawInvalidString = False
         # Deliberately no early exit: every matching string gets logged.
         for marker in self.invalidTextStrings:
             if text.find(marker) != -1:
                 logger.debug(
                     "%s: Found invalid text strings in data extracted: %s",
                     self.pluginName, marker)
                 sawInvalidString = True
         # Fall back to the alternate extraction when the text is flagged
         # as invalid or is too short to be a real article.
         if sawInvalidString or len(text) < self.minArticleLengthInChars:
             text = self.extractArticleBody(rawData)
         # Characters whose repeated runs are handed to filterRepeatedchars().
         repeatedChars = deDupeList([' ', '\t', '\n', '\r\n', '-', '_', '.'])
         text = filterRepeatedchars(text, repeatedChars)
         text = text.replace('\n', ' ')
         # Blank out the configured unwanted substrings.
         for unwanted in deDupeList(self.subStringsToFilter):
             text = text.replace(unwanted, " ")
     except Exception as err:
         logger.error("Error cleaning text: %s", err)
     return text
예제 #2
0
def test_filterRepeatedchars():
    """Check filterRepeatedchars() against a sentence with repeated characters.

    The sample contains runs of spaces, tabs, newlines and hyphens, which
    are all in the character list, plus a run of dots, which is not. The
    expected text has each listed run reduced to one character and the
    dots left as they are.
    """
    # Only the source folder is needed, to make scraper_utils importable.
    _, sourceFolder, _ = getAppFolders()
    sys.path.append(sourceFolder)
    import scraper_utils
    charsToCollapse = [' ', '\t', '\n', '-']
    sample = ('A good sentence with repeated    spaces and tabs \t\t\t and'
              '\n\n\n newlines and hyphens---- dots....')
    expected = ("A good sentence with repeated spaces and tabs \t and"
                "\n newlines and hyphens- dots....")
    resultText = scraper_utils.filterRepeatedchars(sample, charsToCollapse)
    print('Result after filtering repeated characters:\n', resultText)
    assert resultText == expected, \
        "10. filterRepeatedchars() is not filtering repeated characters correctly."
예제 #3
0
 def checkAndCleanText(self, inputText, rawData):
     """ Extract the article body from rawData, validate it and tidy it up.

     The text returned by self.extractArticleBody(rawData) is used in
     place of inputText. If it contains any entry of
     self.invalidTextStrings, the hit is logged at debug level and None
     is returned. Otherwise the text is passed through
     filterRepeatedchars() for a fixed set of characters, and every entry
     of self.subStringsToFilter is replaced by a single space.

     Any exception raised along the way is logged and swallowed; the text
     as it stood when the error occurred is returned. That is inputText
     if the extraction itself failed.

     :param inputText: text used only as the fallback result when
         extractArticleBody() raises
     :param rawData: data handed to extractArticleBody()
     :return: the cleaned text, or None when an invalid string is found
     """
     text = inputText
     try:
         # The alternate extraction is preferred over the supplied text,
         # which is deliberately discarded here.
         text = self.extractArticleBody(rawData)
         for marker in self.invalidTextStrings:
             if text.find(marker) != -1:
                 logger.debug("%s: Found invalid text strings in data extracted: %s", self.pluginName, marker)
                 # The first invalid string rejects the whole article.
                 return None
         # Characters whose repeated runs are handed to filterRepeatedchars().
         repeatedChars = deDupeList([' ', '\t', '\n', '\r\n', '-', '_', '.'])
         text = filterRepeatedchars(text, repeatedChars)
         # Blank out the configured unwanted substrings.
         for unwanted in deDupeList(self.subStringsToFilter):
             text = text.replace(unwanted, " ")
     except Exception as err:
         logger.error("Error cleaning text: %s", err)
     return text