def test_tokenCleaner(self):
    sentence = "Hello I'm very happy 1313"
    goldenSentence = "hello"
    tokens = tokenize(sentence)
    functions.stopwords = load_stopwords("etc/stopwords_en.txt")
    newTokens = tokenCleaner(tokens, ["stemming", "toLowerCase", "removePunctuationAndNumbers", "stopwording"])
    self.assertEqual(sentenize(newTokens), goldenSentence)
def test_processFileUnicode(self):
    rawObject = {
        "date": u"Sun Aug 07 01:28:32 IST 2011",
        "id": u"100000335933878272",
        "user_id": u"71610408",
        "status": u"Según @NWS_PTWC, no hay riesgo generalizado de #tsunami tras el #sismo de Japón http://t.co/icErcNfSCf",
    }
    goldenRawObject = {
        "date": u"Sun Aug 07 01:28:32 IST 2011",
        "id": u"100000335933878272",
        "user_id": u"71610408",
        "status": u"Según @NWS_PTWC, no hay riesgo generalizado de #tsunami tras el #sismo de Japón http://t.co/icErcNfSCf",
        "status_clean": u"Según hay riesgo generalizado tsunami tras sismo Japón",
    }
    rawObjects = [rawObject]
    text_field = "status"
    new_text_field = "status_clean"
    sentence_proc_list = {"removeUrl", "removeUserMention"}
    token_proc_list = {
        "stemming",
        "toLowerCase",
        "removePunctuationAndNumbers",
        "stopwording",
        "removeSingleChar",
        "removeDoubleChar",
    }
    functions.stopwords = load_stopwords("etc/stopwords_en.txt")
    proc = Processor(text_field, new_text_field, sentence_proc_list, token_proc_list)
    newRawObject = proc.processFile(rawObjects)
    self.assertEqual(rawObject, goldenRawObject)
def test_processFile(self):
    rawObject = {
        "date": "Sun Aug 07 01:28:32 IST 2011",
        "id": "100000335933878272",
        "user_id": "71610408",
        "status": "@baloji you were so awesome, it was amazing and you were shining like the star that you are...MERCI!! #baloji i_i",
    }
    goldenRawObject = {
        "date": "Sun Aug 07 01:28:32 IST 2011",
        "id": "100000335933878272",
        "user_id": "71610408",
        "status": "@baloji you were so awesome, it was amazing and you were shining like the star that you are...MERCI!! #baloji i_i",
        "status_clean": "awesome amaze shin star merci baloji",
    }
    rawObjects = [rawObject]
    text_field = "status"
    new_text_field = "status_clean"
    sentence_proc_list = {"removeUrl", "removeUserMention"}
    token_proc_list = {
        "stemming",
        "toLowerCase",
        "removePunctuationAndNumbers",
        "stopwording",
        "removeSingleChar",
        "removeDoubleChar",
    }
    functions.stopwords = load_stopwords("etc/stopwords_en.txt")
    proc = Processor(text_field, new_text_field, sentence_proc_list, token_proc_list)
    newRawObject = proc.processFile(rawObjects)
    self.assertEqual(rawObject, goldenRawObject)
def test_stopwording(self):
    sentence = "at eight not on thursday morning Arthur didn't feel very good"
    goldenSentence = "eight thursday morning Arthur n't feel good"
    # Set the module-level stopword list, as the other tests do; a plain local
    # assignment would not be visible to stopwording().
    functions.stopwords = load_stopwords("etc/stopwords_en.txt")
    tokens = tokenize(sentence)
    newTokens = []
    for token in tokens:
        newTokens.append(stopwording(token))
    self.assertEqual(sentenize(newTokens), goldenSentence)
def test_cleanSentence(self):
    sentence = "At 8 o'clock on Thursday morning, the boys and girls didn't feel very good."
    sentenceProcList = ["removeUrl", "removeUserMention"]
    functions.stopwords = load_stopwords("etc/stopwords_en.txt")
    tokenProcList = [
        "stemming",
        "toLowerCase",
        "removePunctuationAndNumbers",
        "stopwording",
        "removeSingleChar",
        "removeDoubleChar",
    ]
    newSentence = cleanSentence(sentence, sentenceProcList, tokenProcList)
    goldSentence = "oclock thursday morning boy girl feel good"
    self.assertEqual(newSentence, goldSentence)
def test_cleanSentenceUnicode(self):
    sentence = u"Según @NWS_PTWC, no hay riesgo generalizado de #tsunami tras el #sismo de Japón http://t.co/icErcNfSCf"
    sentenceProcList = ["removeUrl", "removeUserMention"]
    functions.stopwords = load_stopwords("etc/stopwords_en.txt")
    tokenProcList = [
        "stemming",
        "toLowerCase",
        "removePunctuationAndNumbers",
        "stopwording",
        "removeSingleChar",
        "removeDoubleChar",
    ]
    newSentence = cleanSentence(sentence, sentenceProcList, tokenProcList)
    goldSentence = u"según hay riesgo generalizado tsunami tras sismo japón"
    self.assertEqual(newSentence, goldSentence)
def cleaner(params, config):
    """
    Core function. It loads the stopword list named in the configuration,
    reads the input CSV file through a local buffer, processes the data line
    by line, and writes the result to the output file.
    """
    # Set the stopwords globally so the token processors can see them
    functions.stopwords = load_stopwords(config.stopword_file_path)
    logger.debug("Configuration = " + str(config.toHash()))

    # Read data from the input file in buffered batches
    p = BufferedParserXSV(config.fields, params.input_file, config.buffer_size, config.split_criteria_line)
    s = SerializerXSV(params.output_file, config.over_write_output_file, config.new_fields)
    proc = Processor(config.text_field, config.new_text_field, config.sentence_proc_list, config.token_proc_list)

    while True:
        raw_objects = p.nextObjects()
        if not raw_objects:
            break
        new_objects, count_line, count_line_output = proc.processFile(raw_objects)
        logger.info("Lines: Processed = " + str(count_line) + ", Produced = " + str(count_line_output))
        s.pushObjects(new_objects)

    logger.info("Total lines: Processed = " + str(count_line) + ", Produced = " + str(count_line_output))
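# A minimal usage sketch of cleaner(), not part of the original source. It
# assumes params/config are simple objects exposing the attributes the
# function reads above; the attribute values and file paths below are
# hypothetical, illustrative placeholders only.
if __name__ == "__main__":
    from types import SimpleNamespace

    params = SimpleNamespace(
        input_file="data/tweets.tsv",          # hypothetical input path
        output_file="data/tweets_clean.tsv",   # hypothetical output path
    )
    config = SimpleNamespace(
        stopword_file_path="etc/stopwords_en.txt",
        fields=["date", "id", "user_id", "status"],
        new_fields=["date", "id", "user_id", "status", "status_clean"],
        buffer_size=1000,
        split_criteria_line="\t",
        over_write_output_file=True,
        text_field="status",
        new_text_field="status_clean",
        sentence_proc_list=["removeUrl", "removeUserMention"],
        token_proc_list=["stemming", "toLowerCase", "removePunctuationAndNumbers",
                         "stopwording", "removeSingleChar", "removeDoubleChar"],
    )
    # SimpleNamespace has no toHash(); attach one so the debug log call works.
    config.toHash = lambda: vars(config)

    cleaner(params, config)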