class TestPreprocessor(unittest.TestCase):
    """Unit tests for Preprocessor.clean()."""

    def setUp(self):
        # Fresh Preprocessor per test so no state leaks between cases.
        self.pp = Preprocessor()

    def testPreporcessNull(self):
        # An empty string has nothing to keep, so clean() returns None.
        result = self.pp.clean('')
        # Fix: assertEquals is a deprecated alias (removed in Python 3.12);
        # assertIsNone also states the intent directly.
        self.assertIsNone(result)

    def testPreporcessOneHiragana(self):
        # A single hiragana plus emoticon noise is stripped to nothing -> None.
        result = self.pp.clean(u'あw(^^)w')
        self.assertIsNone(result)

    def testPreporcessZenkaku(self):
        # Full-width (zenkaku) text is converted/cleaned and the trailing
        # laugh marker 'ww' removed.
        result = self.pp.clean(u'全角です123ww')
        self.assertEqual(result, u'全角です123')

    def testPreprocessNakano(self):
        # End-to-end: laugh markers, ASCII-art faces and trailing noise are
        # all removed, leaving only the sentence proper.
        test_string = u'私の名前は中野ですwwww>あふぉ(^^)o'
        result = self.pp.clean(test_string)
        expected = u'私の名前は中野です'
        self.assertEqual(result, expected)
class MessageParser:
    """Parse Japanese chat messages into pseudo-word sentences via ChaSen.

    Messages are optionally cleaned by Preprocessor, morphologically
    analysed with ChaSen, then post-processed to merge word sequences that
    ChaSen splits too aggressively (compound nouns, unknown words, symbols).
    """

    # Precompiled alternation of the bot owner's known names/aliases.
    my_name = re.compile('|'.join(MY_NAMES))

    def __init__(self):
        self.cha = chasen
        self.pp = Preprocessor()
        # Per-word output: appeared name, appeared reading, base name,
        # base reading, main type (with first sub type), sub type;
        # words terminated by the '|$|' sentinel.
        self.cha.setformat('%m,%y,%M,%Y,%U(%P1),%BB|$|')

    def parseLine(self, message, clean=True):
        """Parse a given line via chasen.

        If clean is True, does the preprocessing of message.  Returns the
        ChaSen output as a unicode string, or None when the (cleaned)
        message is empty.
        """
        pMessage = self.pp.clean(message) if clean else message
        if not pMessage:
            print('message is empty:' + message)
            return None
        # ChaSen consumes EUC-JP bytes; decode its output back to unicode.
        pl = self.cha.sparse(pMessage.encode('euc-jp')).strip()
        return unicode(pl, 'euc-jp')

    def parseSentence(self, message, clean=True):
        """Return a list of pseudo words (= sentence) from a string."""
        myNameFlag, message = self.checkForMe(message)
        words = self.parseLine(message, clean)
        if not words:
            return None
        # Field names matching the setformat() order in __init__.
        keywords = ['appeared_name', 'appeared_reading', 'base_name',
                    'base_reading', 'main_type', 'sub_type']
        sentence = []
        for word in words.split('|$|'):
            if word:
                sentence.append(generateDict(keywords, word.split(',')))
        return self.doMergers(myNameFlag, sentence)

    def checkForMe(self, message):
        """Check if my name is in the message.

        Returns (flag, message) where every alias has been replaced by
        u'私' so that ChaSen parses it as an ordinary pronoun.
        """
        name_flag = False
        if self.my_name.search(message):
            message = re.sub(self.my_name, u'私', message)
            name_flag = True
        return name_flag, message

    def merge(self, part, first, second=None):
        """Merge adjacent words of a part depending on their lex class.

        Any word whose main_type equals `first` and is immediately
        followed by a word whose main_type equals `second` (defaults to
        `first`) is joined into one pseudo word; the scan restarts until
        no mergeable pair remains.
        """
        if len(part) == 1:
            return part
        if not second:
            second = first
        for i, word in enumerate(part):
            if word['main_type'] == first:
                # Check whether the next word completes a mergeable pair.
                if i < len(part) - 1 and part[i + 1]['main_type'] == second:
                    part[i + 1] = self.joinWords(word, part[i + 1])
                    part.pop(i)
                    # Restart on the shortened list rather than continue
                    # iterating a list we just mutated.
                    return self.merge(part, first, second)
        return part

    def joinWords(self, word1, word2):
        """Return a pseudo word joining the given 2.

        Names and readings are concatenated; the lexical type is taken
        from the second (right-hand) word.
        """
        return {
            'appeared_name': word1['appeared_name'] + word2['appeared_name'],
            'appeared_reading': word1['appeared_reading'] + word2['appeared_reading'],
            'base_name': word1['base_name'] + word2['base_name'],
            'base_reading': word1['base_reading'] + word2['base_reading'],
            'main_type': word2['main_type'],
            'sub_type': word2['sub_type'],
        }

    def doMergers(self, myNameFlag, sentence):
        """Do some post processing functions on the sentence.

        Required to work around some of the chasen limitations: the
        sentence is split at particles (助詞), compound nouns / unknown
        words / symbols are merged within each part, and the parts are
        flattened back into one sentence.
        """
        # Fix: the original used sentence.index(x), which returns the FIRST
        # occurrence and mislabels duplicate words; enumerate gives each
        # particle its own position.
        jyoshi_idx = set(i for i, x in enumerate(sentence)
                         if x['main_type'] == u'助詞')
        # Treat the last word as a split point too.
        jyoshi_idx.add(len(sentence) - 1)
        parts = []
        w_part = []
        for i, word in enumerate(sentence):
            if i in jyoshi_idx:
                parts.append(w_part)
                parts.append([word])
                w_part = []
            else:
                if myNameFlag and word['appeared_name'] == u'私':
                    # Restore the real name where checkForMe() substituted
                    # u'私'.  NOTE(review): this also rewrites a genuine
                    # u'私' typed by the user whenever an alias appeared
                    # anywhere in the message — confirm this is intended.
                    word['appeared_name'] = u'じゅな'
                    # Fix: original wrote the misspelled key
                    # 'appared_reading', leaving 'appeared_reading' stale.
                    word['appeared_reading'] = u'じゅな'
                    word['base_name'] = u'じゅな'
                    word['base_reading'] = u'じゅな'
                w_part.append(word)
        # We need to iterate thro the parts, and merge nouns.
        # This is because a combo noun is quite different meaning wise
        # from a regular noun.  merge() mutates each part in place.
        for p in parts:
            p = self.merge(p, u'名詞')
            p = self.merge(p, u'未知語')
            p = self.merge(p, u'記号')
        # Finally we merge them all to a new pseudo sentence.
        return [w for p in parts for w in p]
from Preprocessor import Preprocessor
from Vectorizer import Vectorizer
from Classifier import Classifier
from DeepLearning import DeepLearner
from sklearn.model_selection import train_test_split as split
import numpy as np

# Script: train a CNN model on OffensEval sub-task A with count vectors.
# NOTE(review): DataReader is used below but never imported in this chunk —
# confirm it is brought into scope elsewhere in the file.
dr = DataReader('./datasets/training-v1/offenseval-training-v1.tsv', 'A')
data, labels = dr.get_labelled_data()
data, labels = dr.shuffle(data, labels, 'random')
# Full-slice copies are no-ops as written; presumably placeholders for
# sub-sampling the dataset (e.g. data[:1000]) — confirm.
data = data[:]
labels = labels[:]
prp = Preprocessor('remove_stopwords', 'lemmatize')
data = prp.clean(data)
# Stratified 80/20 train/test split.
tr_data, tst_data, tr_labels, tst_labels = split(np.array(data),
                                                 labels,
                                                 test_size=0.2,
                                                 stratify=labels)
# Upsample label-1 samples in the training split, then reshuffle —
# presumably to balance the classes; verify against DataReader.upsample.
tr_data, tr_labels = dr.upsample(tr_data, tr_labels, label=1)
tr_data, tr_labels = dr.shuffle(tr_data, tr_labels, 'random')
# Fit a count vectorizer on the training data; its vocabulary size feeds
# the model's embedding layer.
vct = Vectorizer('count')
vct.vectorize(tr_data)
model = DeepLearner(tr_data, tr_labels, vocab_length=vct.vocab_length,
                    model_type='CNN')
# Sub-task B label names indexed by the classifier's 0/1 prediction.
sub_b = ['UNT', 'TIN']

# NOTE(review): DataReader is used here but not imported in this chunk —
# confirm it is in scope elsewhere in the file.
dr_tr = DataReader('./datasets/training-v1/offenseval-training-v1.tsv', 'B')
tr_data, tr_labels = dr_tr.get_labelled_data()
# Upsample label-0 samples, then reshuffle.
tr_data, tr_labels = dr_tr.upsample(tr_data, tr_labels, label=0)
tr_data, tr_labels = dr_tr.shuffle(tr_data, tr_labels, 'random')
dr_tst = DataReader('./datasets/test-B/testset-taskb.tsv')
tst_data, tst_ids = dr_tst.get_test_data()
# Keep only the first 500 samples — presumably to speed up tuning; note
# this happens AFTER upsampling/shuffling, so the subset is random.
tr_data = tr_data[:500]
tr_labels = tr_labels[:500]

##### Naive Bayes - Lemmatize - tfidf
prp = Preprocessor('remove_stopwords')
tr_data_clean = prp.clean(tr_data)
tst_data_clean = prp.clean(tst_data)
vct = Vectorizer('tfidf')
tr_vectors = vct.vectorize(tr_data_clean)
tst_vectors = vct.vectorize(tst_data_clean)
clf = Classifier('M-NaiveBayes')
# Grid-tune alpha / fit_prior; best_only=False returns all scores.
tuned_accs = clf.tune(tr_vectors, tr_labels,
                      {'alpha': [1, 5, 10], 'fit_prior': [True, False]},
                      best_only=False)
print('NB Tuned:', tuned_accs)
predictions = clf.predict(tst_vectors)
# Write one "<id>,<label>" line per test tweet.
with open('subtask-B-test-NB.csv', 'w') as f:
    # Fix: the original loop variable was named `id`, shadowing the
    # builtin; zip also avoids indexing predictions by position.
    for tweet_id, pred in zip(tst_ids, predictions):
        f.write(str(tweet_id) + ',' + str(sub_b[pred]) + '\n')
} ]] dr = DataReader('./datasets/training-v1/offenseval-training-v1.tsv', 'A') data, labels = dr.get_labelled_data() data, labels = dr.shuffle(data, labels, 'random') data, _, labels, _ = split(data, labels, test_size=0.5, stratify=labels) clf_dict = {clf[0]: {} for clf in clf_list} for vec in vec_list: print('Vectorization: ', vec[0]) for prp in prp_list: print('Preprocessing: ', prp) preprocessor = Preprocessor(prp) clean_data = preprocessor.clean(data) vectorizer = Vectorizer(type=vec[0], params=vec[1]) vectorized_data = vectorizer.vectorize(clean_data) for cl in clf_list: if vec[0] not in ['BoW', 'tfidf'] and cl[0] == 'M-NaiveBayes': continue print('Classifier: ', cl[0]) clf = Classifier(cl[0]) params_accs = clf.tune(vectorized_data, labels, cl[1], best_only=False) print('Scores:', params_accs) for key, value in params_accs.items():