"""Module prologue for the OffensEval offense-detection trainer.

Loads the training corpus, an English offensive-word lexicon and GloVe
Twitter embeddings, then builds a naive offensive-word-similarity
transformer used as a feature extractor.  `runFitting` below does the
actual pipeline construction and training.
"""

# Third-party
from joblib import dump, load
from sklearn.model_selection import KFold, cross_validate
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.svm import LinearSVC

# Project-local
import helperFunctions
import transformers

# Data / resource paths (relative to the repository root)
offenseval_train = './Data/train/offenseval-training-v1.tsv'
offenseval_test = './Data/train/testset-taska.tsv'
offense_words = './Resources/offensive_words_eng.txt'
path_to_embs = './Resources/glove.twitter.27B.200d.txt'

# Task flavour: binary (OFFENSE vs OTHER) or multi-class.
TASK = 'binary'
#TASK = 'multi'

IDsTrain, Xtrain, Ytrain = helperFunctions.read_corpus(offenseval_train)
# BUGFIX: the message previously said 'test data read', but the file read
# above is the *training* corpus.
print('training data read')

# Minimal preprocessing / cleaning
Xtrain = helperFunctions.clean_samples(Xtrain)

offensiveWords = helperFunctions.load_offense_words(offense_words)
print("offensive words loaded")

# NOTE: loading the 200d GloVe Twitter embeddings is slow and memory-heavy.
embeddings, vocab = helperFunctions.load_embeddings(path_to_embs)
print("embeddings loaded")

# Feature transformer scoring tokens by similarity to the offense lexicon.
transformer = transformers.naiveOffensiveWordSimilarity(embeddings, offensiveWords)
def runFitting(params, objects):
    """Build, fit, persist and cross-validate the offense classifier.

    Parameters
    ----------
    params : dict
        Boolean feature switches read here: "sentenceComplexityCheck",
        "embeddingsTermFreqFiltering", "oneHotTermFreqFiltering",
        "tweetTokenization", "charNgramFreqFiltering", "POStagCheck".
    objects : dict
        Pre-built project components read here: "freqFilter",
        "charFreqFilter", "liguisticFeatureExtractor" (sic — key typo is
        part of the caller contract), "embeddings".

    Returns
    -------
    The fitted sklearn Pipeline (vectorizer + LinearSVC).

    Side effects: reads the training TSV from disk, prints progress, and
    dumps the fitted model to ./Models/RUG_Offense_concatModel.joblib.
    """
    # Local TASK shadows the module-level one; only 'binary' is active here.
    TASK = 'binary'
    #TASK = 'multi'
    ''' Preparing data '''
    # Names of linguistic features to extract; only "posTag" is ever added.
    featureList = []
    if params["sentenceComplexityCheck"]:
        featureList.append("posTag")
    # Configure the shared frequency filter according to the switches.
    if params["embeddingsTermFreqFiltering"]:
        objects["freqFilter"].embeddingsEnabled = True
    if params["oneHotTermFreqFiltering"]:
        objects["freqFilter"].oneHotEnabled = True
    # NOTE(review): key is spelled "liguistic..." (missing 'n') — callers
    # must use the same misspelled key, so it cannot be fixed here alone.
    objects["liguisticFeatureExtractor"].setFeatureList(featureList)
    # Re-declares the training path locally (duplicates the module constant).
    offenseval_train = './Data/train/offenseval-training-v1.tsv'
    #print('Reading in offenseval training data...')
    if TASK == 'binary':
        IDsTrain, Xtrain, Ytrain = helperFunctions.read_corpus(offenseval_train)
    else:
        IDsTrain, Xtrain, Ytrain = helperFunctions.read_corpus(offenseval_train, binary=False)
    # Minimal cleaning, same as the module-level preprocessing.
    Xtrain = helperFunctions.clean_samples(Xtrain)
    ''' Preparing vectorizer and classifier '''
    # Vectorizing data / Extracting features
    #print('Preparing tools (vectorizer, classifier) ...')
    # Word n-grams (1-2), optionally tokenized with NLTK's TweetTokenizer.
    # NOTE(review): `stop_words` and `TweetTokenizer` are not imported in
    # this file as seen — presumably imported elsewhere; verify.
    if params["tweetTokenization"]:
        count_word = transformers.CountVectorizer(ngram_range=(1, 2), stop_words=stop_words.get_stop_words('en'), tokenizer=TweetTokenizer().tokenize)
    else:
        count_word = transformers.CountVectorizer(ngram_range=(1, 2), stop_words=stop_words.get_stop_words('en'))
    # Character n-grams of length 3-7.
    count_char = transformers.CountVectorizer(analyzer='char', ngram_range=(3, 7))
    # Max-pooled word embeddings.
    # NOTE(review): module `features` is not imported in this file as seen —
    # possibly meant to live in `transformers`; confirm against the project.
    embedder = features.Embeddings(objects["embeddings"], pool='max')
    # Base feature union: word n-grams + char n-grams + pooled embeddings.
    vectorizer = FeatureUnion([('word', count_word), ('char', count_char), ('word_embeds', embedder)])
    # Optional extra feature extractors are appended to the union in place.
    if len(featureList) > 0:
        vectorizer.transformer_list.append(('lingFeats', objects["liguisticFeatureExtractor"]))
    if params["oneHotTermFreqFiltering"] or params["embeddingsTermFreqFiltering"]:
        vectorizer.transformer_list.append(('freqFilter', objects["freqFilter"]))
    if params["charNgramFreqFiltering"]:
        # Char-level frequency filter runs in one-hot mode only.
        objects["charFreqFilter"].oneHotEnabled = True
        objects["charFreqFilter"].embeddingsEnabled = False
        vectorizer.transformer_list.append(('charfreqFilter', objects["charFreqFilter"]))
    if params["POStagCheck"]:
        vectorizer.transformer_list.append(('posTagger', transformers.posTagExtractor(Xtrain, Ytrain)))
    # Set up SVM classifier with unbalanced class weights
    # (dead alternative kept below as a string literal; currently unweighted)
    """ if TASK == 'binary': # cl_weights_binary = None cl_weights_binary = {'OTHER':1, 'OFFENSE':10} clf = LinearSVC(class_weight=cl_weights_binary) else: # cl_weights_multi = None cl_weights_multi = {'OTHER':0.5, 'ABUSE':3, 'INSULT':3, 'PROFANITY':4} clf = LinearSVC(class_weight=cl_weights_multi) """
    clf = LinearSVC()
    #scaler = StandardScaler(with_mean=False)
    classifier = Pipeline([
        ('vectorize', vectorizer),
        #('scale', scaler),
        ('classify', clf)])
    ''' Actual training and predicting: '''
    print('Fitting on training data...')
    classifier.fit(Xtrain, Ytrain)
    # Persist the fitted pipeline before cross-validation.
    print("storing")
    dump(classifier, "./Models/RUG_Offense_concatModel.joblib")
    ##print("cross validating")
    ### predicting on set aside training data
    #print('Predicting on set aside data...')
    #Yguess = classifier.predict(XcustomTest)
    # 10-fold CV on the training data; note this re-fits the pipeline 10
    # more times, which is expensive on top of the full fit above.
    print("cross validating")
    result = cross_validate(classifier, Xtrain, Ytrain, cv=10)
    print(result)
    ########
    #print('Predicting...')
    #Yguess = classifier.predict(Xtest)
    # Dead submission-output code kept below as a string literal.
    """ ''' Outputting in format required ''' print('Outputting predictions...') outdir = '/Users/balinthompot/RUG/Honours/HateSpeech/offenseval-rug-master/Submission' fname = 'rug_fine_2.txt' with open(outdir + '/' + fname, 'w', encoding='utf-8') as fo: assert len(Yguess) == len(Xtest_raw), 'Unequal length between samples and predictions!' for idx in range(len(Yguess)): # print(Xtest_raw[idx] + '\t' + Yguess[idx] + '\t' + 'XXX', file=fo) # binary task (coarse) print(Xtest_raw[idx] + '\t' + 'XXX' + '\t' + Yguess[idx], file=fo) # multi task (fine) print('Done.') """
    return classifier