Exemplo n.º 1
0
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.svm import LinearSVC
from sklearn.model_selection import KFold, cross_validate
from joblib import dump, load
import helperFunctions
import transformers

# Input locations, relative to the directory the script is launched from.
offenseval_train = './Data/train/offenseval-training-v1.tsv'
offenseval_test = './Data/train/testset-taska.tsv'
offense_words = './Resources/offensive_words_eng.txt'
path_to_embs = './Resources/glove.twitter.27B.200d.txt'

# Task selector. NOTE(review): not referenced in the visible part of this
# script; read_corpus is called below with its default arguments.
TASK = 'binary'
#TASK = 'multi'

# Load the training corpus as (ids, samples, labels).
IDsTrain, Xtrain, Ytrain = helperFunctions.read_corpus(offenseval_train)
# The file just loaded is the training set, so the progress message says so
# (it previously claimed the test data had been read).
print('training data read')

# Minimal preprocessing / cleaning

Xtrain = helperFunctions.clean_samples(Xtrain)
offensiveWords = helperFunctions.load_offense_words(offense_words)

print("offensive words loaded")
# load_embeddings returns the embedding table together with its vocabulary.
embeddings, vocab = helperFunctions.load_embeddings(path_to_embs)
print("embeddings loaded")

# Feature transformer built from the embeddings and the offensive-word list.
transformer = transformers.naiveOffensiveWordSimilarity(
    embeddings, offensiveWords)

#lr = LogisticRegressionCV(max_iter=10000)
Exemplo n.º 2
0
def runFitting(params, objects):
    """Fit, persist and cross-validate the concatenated-feature SVM pipeline.

    The pipeline is a ``FeatureUnion`` of word n-gram counts, character
    n-gram counts and max-pooled word embeddings, optionally extended with
    further transformers, followed by a ``LinearSVC``.

    Args:
        params: Mapping of feature switches. The keys read here are
            ``"sentenceComplexityCheck"``, ``"embeddingsTermFreqFiltering"``,
            ``"oneHotTermFreqFiltering"``, ``"tweetTokenization"``,
            ``"charNgramFreqFiltering"`` and ``"POStagCheck"``; each is used
            only for its truthiness.
        objects: Mapping of pre-built, caller-owned helpers. The keys read
            here are ``"liguisticFeatureExtractor"`` and ``"embeddings"``
            (always), ``"freqFilter"`` (when either term-frequency switch is
            on) and ``"charFreqFilter"`` (when ``"charNgramFreqFiltering"``
            is on). The filter objects are mutated in place.

    Returns:
        The ``Pipeline`` fitted on the full training set.

    Side effects:
        Reads the training corpus from disk, writes the fitted pipeline to
        ``./Models/RUG_Offense_concatModel.joblib`` and prints progress
        messages plus the 10-fold cross-validation result.
    """
    TASK = 'binary'
    #TASK = 'multi'

    # ---- Preparing data ----

    featureList = []
    if params["sentenceComplexityCheck"]:
        featureList.append("posTag")

    embeddings_filtering = bool(params["embeddingsTermFreqFiltering"])
    one_hot_filtering = bool(params["oneHotTermFreqFiltering"])
    use_freq_filter = one_hot_filtering or embeddings_filtering

    if use_freq_filter:
        # `objects` is supplied by the caller, so the filter may be reused
        # across calls. Assign both flags explicitly so a mode switched on by
        # an earlier call does not silently stay on for this one.
        freq_filter = objects["freqFilter"]
        freq_filter.embeddingsEnabled = embeddings_filtering
        freq_filter.oneHotEnabled = one_hot_filtering

    objects["liguisticFeatureExtractor"].setFeatureList(featureList)
    offenseval_train = './Data/train/offenseval-training-v1.tsv'

    # The corpus ids are not needed for fitting.
    if TASK == 'binary':
        _, Xtrain, Ytrain = helperFunctions.read_corpus(offenseval_train)
    else:
        _, Xtrain, Ytrain = helperFunctions.read_corpus(offenseval_train, binary=False)

    Xtrain = helperFunctions.clean_samples(Xtrain)

    # ---- Preparing vectorizer and classifier ----

    # Word uni-/bigram counts; the tweet-aware tokenizer is opt-in.
    word_vectorizer_args = {
        'ngram_range': (1, 2),
        'stop_words': stop_words.get_stop_words('en'),
    }
    if params["tweetTokenization"]:
        word_vectorizer_args['tokenizer'] = TweetTokenizer().tokenize
    count_word = transformers.CountVectorizer(**word_vectorizer_args)

    # Character 3- to 7-gram counts.
    count_char = transformers.CountVectorizer(analyzer='char', ngram_range=(3, 7))

    embedder = features.Embeddings(objects["embeddings"], pool='max')

    # Collect every feature extractor first, then build the union once.
    transformer_list = [
        ('word', count_word),
        ('char', count_char),
        ('word_embeds', embedder),
    ]

    if featureList:
        transformer_list.append(('lingFeats', objects["liguisticFeatureExtractor"]))

    if use_freq_filter:
        transformer_list.append(('freqFilter', objects["freqFilter"]))

    if params["charNgramFreqFiltering"]:
        char_freq_filter = objects["charFreqFilter"]
        char_freq_filter.oneHotEnabled = True
        char_freq_filter.embeddingsEnabled = False
        transformer_list.append(('charfreqFilter', char_freq_filter))

    if params["POStagCheck"]:
        transformer_list.append(('posTagger', transformers.posTagExtractor(Xtrain, Ytrain)))

    vectorizer = FeatureUnion(transformer_list)

    # Plain LinearSVC with default (unweighted) classes. Per-class weights
    # (e.g. {'OTHER': 1, 'OFFENSE': 10} for the binary task) were tried
    # earlier and are currently disabled, as is feature scaling.
    clf = LinearSVC()

    classifier = Pipeline([
        ('vectorize', vectorizer),
        ('classify', clf),
    ])

    # ---- Actual training and evaluation ----

    print('Fitting on training data...')
    classifier.fit(Xtrain, Ytrain)
    print("storing")
    dump(classifier, "./Models/RUG_Offense_concatModel.joblib")

    # cross_validate fits clones of the estimator, so `classifier` remains
    # the model fitted on the full training set.
    print("cross validating")
    result = cross_validate(classifier, Xtrain, Ytrain, cv=10)
    print(result)

    return classifier