def test_remove_characters(self):
    """Exercise string_process.remove_characters on a table of inputs.

    The first two cases pass only the text and therefore rely on the
    function's default behaviour; the last two also pass an explicit
    second argument, whose characters are expected to be absent from
    the result.
    """
    # Each entry is (positional arguments, expected return value).
    cases = [
        ((u'太萌了><',), u'太萌了'),
        ((u'!!!!!!!!!@uK3RXUYW3:',), u'@uK3RXUYW3'),
        ((u'阿里吧吧啊,是', u','), u'阿里吧吧啊是'),
        ((u'阿 里吧吧啊?是', u'? '), u'阿里吧吧啊是'),
    ]
    for args, expected in cases:
        assert string_process.remove_characters(*args) == expected
message = unicode(','.join(transactions[6:transactions_size-4]), encoding='utf-8', errors='ignore') raw_message[raw_mid] = message src_message[src_mid].append(message) # extract text from current user # remove @somebody text # remove symbols message = message[:message.find(u'//')].lower().strip() message = re.sub(u'@.*?$', u'', message.strip()) message = re.sub(u'@.*?:', u'', message.strip()) message = re.sub(u'[a-z]+$', u'', message.strip()) message = re.sub(u'^[a-z]+', u'', message.strip()) message = string_process.remove_characters(message.strip()) if len(message) == 0: continue csvwriter.writerow([raw_mid, src_mid, message.encode('utf-8')]) # test_num -= 1 # if test_num < 0: # break #for raw in raw_message.keys(): # print "-----------------------" # print "-----------------------" # print raw_message[raw].encode('utf-8') # for message in src_message[raw]: