Example #1
 def test_tokenCleaner(self):
     sentence = "Hello I'm very happy 1313"
     goldenSentence = "hello"
     tokens = tokenize(sentence)
     functions.stopwords = load_stopwords("etc/stopwords_en.txt")
     newTokens = tokenCleaner(tokens, ["stemming", "toLowerCase", "removePunctuationAndNumbers", "stopwording"])
     self.assertEqual(sentenize(newTokens), goldenSentence)
Example #2
 def test_processFileUnicode(self):
     rawObject = {
         "date": u"Sun Aug 07 01:28:32 IST 2011",
         "id": u"100000335933878272",
         "user_id": u"71610408",
         "status": u"Según @NWS_PTWC, no hay riesgo generalizado de #tsunami tras el #sismo de Japón http://t.co/icErcNfSCf",
     }
     goldenRawObject = {
         "date": u"Sun Aug 07 01:28:32 IST 2011",
         "id": u"100000335933878272",
         "user_id": u"71610408",
         "status": u"Según @NWS_PTWC, no hay riesgo generalizado de #tsunami tras el #sismo de Japón http://t.co/icErcNfSCf",
         "status_clean": u"Según hay riesgo generalizado tsunami tras sismo Japón",
     }
     rawObjects = [rawObject]
     text_field = "status"
     new_text_field = "status_clean"
     sentence_proc_list = {"removeUrl", "removeUserMention"}
     token_proc_list = {
         "stemming",
         "toLowerCase",
         "removePunctuationAndNumbers",
         "stopwording",
         "removeSingleChar",
         "removeDoubleChar",
     }
     functions.stopwords = load_stopwords("etc/stopwords_en.txt")
     proc = Processor(text_field, new_text_field, sentence_proc_list, token_proc_list)
     newRawObject = proc.processFile(rawObjects)
     self.assertEqual(rawObject, goldenRawObject)
Example #3
 def test_processFile(self):
     rawObject = {
         "date": "Sun Aug 07 01:28:32 IST 2011",
         "id": "100000335933878272",
         "user_id": "71610408",
         "status": "@baloji you were so awesome, it was amazing and you were shining like the star that you are...MERCI!! #baloji i_i",
     }
     goldenRawObject = {
         "date": "Sun Aug 07 01:28:32 IST 2011",
         "id": "100000335933878272",
         "user_id": "71610408",
         "status": "@baloji you were so awesome, it was amazing and you were shining like the star that you are...MERCI!! #baloji i_i",
         "status_clean": "awesome amaze shin star merci baloji",
     }
     rawObjects = [rawObject]
     text_field = "status"
     new_text_field = "status_clean"
     sentence_proc_list = {"removeUrl", "removeUserMention"}
     token_proc_list = {
         "stemming",
         "toLowerCase",
         "removePunctuationAndNumbers",
         "stopwording",
         "removeSingleChar",
         "removeDoubleChar",
     }
     functions.stopwords = load_stopwords("etc/stopwords_en.txt")
     proc = Processor(text_field, new_text_field, sentence_proc_list, token_proc_list)
     newRawObject = proc.processFile(rawObjects)
     self.assertEqual(rawObject, goldenRawObject)
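Examples #2 and #3 assert against the original rawObject rather than the value returned by processFile, which implies the method mutates each dictionary in place while also returning counters (the triple return is visible in Example #7). The following is only a minimal sketch of a Processor with that behavior; the class and field names mirror the tests, but the body is an assumption, not the project's actual implementation.

    class Processor:
        def __init__(self, text_field, new_text_field, sentence_proc_list, token_proc_list):
            self.text_field = text_field
            self.new_text_field = new_text_field
            self.sentence_proc_list = sentence_proc_list
            self.token_proc_list = token_proc_list

        def processFile(self, raw_objects):
            # Counters mirror the [new_objects, count_line, count_line_output]
            # return value unpacked in Example #7.
            count_line = 0
            count_line_output = 0
            new_objects = []
            for obj in raw_objects:
                count_line += 1
                if self.text_field not in obj:
                    continue
                # Add the cleaned field to the same dictionary, leaving the
                # original text field untouched (as the test assertions expect).
                obj[self.new_text_field] = cleanSentence(
                    obj[self.text_field], self.sentence_proc_list, self.token_proc_list
                )
                new_objects.append(obj)
                count_line_output += 1
            return [new_objects, count_line, count_line_output]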
Example #4
 def test_stopwording(self):
   sentence = "at eight not on thursday morning Arthur didn't feel very good"
   goldenSentence = "eight thursday morning Arthur n't feel good"
   language = 'english'
   stopwords = load_stopwords('etc/stopwords_en.txt')
   tokens = tokenize(sentence)
   newTokens = []
   for token in tokens:
      newTokens.append(stopwording(token))
   self.assertEqual(sentenize(newTokens), goldenSentence)
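The golden sentence above suggests that stopwording operates on one token at a time, returning an empty string for tokens found in the loaded stopword list and passing everything else through unchanged, with sentenize later dropping the empty tokens when it rebuilds the string. A possible sketch under those assumptions (the case-insensitive lookup is also an assumption):

    def stopwording(token):
        # Drop the token if its lower-cased form is in the loaded stopword
        # list; otherwise return it unchanged. Empty strings are assumed to
        # be skipped when the sentence is rebuilt by sentenize.
        if token.lower() in stopwords:
            return ""
        return token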
Example #5
 def test_cleanSentence(self):
     sentence = "At 8 o'clock on Thursday morning, the boys and girls didn't feel very good."
     sentenceProcList = ["removeUrl", "removeUserMention"]
     functions.stopwords = load_stopwords("etc/stopwords_en.txt")
     tokenProcList = [
         "stemming",
         "toLowerCase",
         "removePunctuationAndNumbers",
         "stopwording",
         "removeSingleChar",
         "removeDoubleChar",
     ]
     newSentence = cleanSentence(sentence, sentenceProcList, tokenProcList)
     goldSentence = "oclock thursday morning boy girl feel good"
     self.assertEqual(newSentence, goldSentence)
Example #6
 def test_cleanSentenceUnicode(self):
     sentence = (
         u"Según @NWS_PTWC, no hay riesgo generalizado de #tsunami tras el #sismo de Japón http://t.co/icErcNfSCf"
     )
     sentenceProcList = ["removeUrl", "removeUserMention"]
     functions.stopwords = load_stopwords("etc/stopwords_en.txt")
     tokenProcList = [
         "stemming",
         "toLowerCase",
         "removePunctuationAndNumbers",
         "stopwording",
         "removeSingleChar",
         "removeDoubleChar",
     ]
     newSentence = cleanSentence(sentence, sentenceProcList, tokenProcList)
     goldSentence = u"según hay riesgo generalizado tsunami tras sismo japón"
     self.assertEqual(newSentence, goldSentence)
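Examples #5 and #6 show cleanSentence applying the sentence-level processors to the raw string first (URLs and @-mentions disappear before tokenization) and the token-level processors afterwards. A plausible outline, reusing tokenize, tokenCleaner and sentenize from the earlier examples; the name-based dispatch via globals() is an assumption about how processors are looked up:

    def cleanSentence(sentence, sentenceProcList, tokenProcList):
        # Sentence-level passes (removeUrl, removeUserMention, ...) operate on
        # the whole string before it is split into tokens.
        for procName in sentenceProcList:
            sentence = globals()[procName](sentence)
        # Token-level passes (stemming, stopwording, ...) are delegated to
        # tokenCleaner, and the surviving tokens are joined back into a string.
        tokens = tokenize(sentence)
        tokens = tokenCleaner(tokens, tokenProcList)
        return sentenize(tokens)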
Example #7
def cleaner(params, config):
  """ This is the core function. First, it sets the configuration file and the stopword file
      then read the input CSV file using a local buffer. The data read is processed line 
      by line and the result is stored in the outpufile.
  """
  #Set stopwords globally
  stopwords = load_stopwords(config.stopword_file_path)
  logger.debug("Configuration = " + str(config.toHash()))
  #Read data from input file
  p = BufferedParserXSV(config.fields, params.input_file, config.buffer_size, config.split_criteria_line)
  s = SerializerXSV(params.output_file, config.over_write_output_file, config.new_fields)
  proc = Processor(config.text_field, config.new_text_field, config.sentence_proc_list, config.token_proc_list)
  while True:
    raw_objects = p.nextObjects()
    if not raw_objects:
      break
    [new_objects, count_line, count_line_output] = proc.processFile(raw_objects)
    logger.info("Lines: Processed = " + str(count_line) + ", Produced = " + str(count_line_output) )
    s.pushObjects(new_objects)
  logger.info("Total lines: Processed = " + str(count_line) + ", Produced = " + str(count_line_output) )