if x == -1 or x == 0: f.write(str(classes[0])) else: f.write(str(classes[1])) f.write("\n") return res """ X_train,Y_train = Loader.load_pres(fname) X_test, _ = Loader.load_pres(tname) result = predict(X_train, Y_train, X_test, save = "auteurs.txt", classes = ["M","C"], post_processing=True) fig,ax = plt.subplots(figsize=(35,100)) ax.imshow(result.reshape(54,-1),interpolation="nearest") """ # plt.tight_layout() X_train, Y_train = Loader.load_movies(fname_2) X_test = Loader.load_movies_test(tname_2) result_sent = predict(X_train, Y_train, X_test, params=params_sentiments, save="sentiments.txt", classes=["-1", "1"], post_processing=False, equilibrage=False)
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn import linear_model as lin
from sklearn import svm
import sklearn.naive_bayes as nb
from wordcloud import WordCloud
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
from time import time
import spacy
import numpy as np
import pickle

# NOTE(review): `Loader`, `Preprocessing`, `stop` and `TfidfVectorizer` are
# used below but are not imported or defined in the visible part of this
# script -- confirm they are brought into scope elsewhere.

# Learning split of the corpus; load_pres is unpacked into two values
# (presumably the texts and their labels -- verify against Loader).
fname = "Data/AFDpresidentutf8/corpus.tache1.learn.utf8"
alltxts, alllabs = Loader.load_pres(fname)

# Candidate values per option: every key maps to the list of alternatives
# to try for that option. How the grid is consumed is not visible here.
params = {
    "lowercase": [False, True],
    "punct": [False, True],
    "marker": [False, True],
    "number": [False, True],
    "stemming": [False, Preprocessing.stem],
    "ligne": [None, -2, 0],
    "strip_accents": [False, True],
    "stopwords": [None, stop],  # set(STOPWORDS)],
    "Vectorizer": [CountVectorizer, TfidfVectorizer],
    "binary": [True, False],
    "class_weight": ["balanced", None],
    "max_features": [None, 10000, 7000],
    "ngram_range": [(1, 1), (1, 2)],
# -*- coding: utf-8 -*-
from utils.utils import Loader
from utils.preprocessing import Preprocessing
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from utils.oddsRatio import OddsRatioCloud
from time import time
import spacy
from nltk.corpus import stopwords

# Learning split of the corpus; load_pres is unpacked into two values
# (presumably the texts and their labels -- verify against Loader).
fname = "Data/AFDpresidentutf8/corpus.tache1.learn.utf8"
train_x, train_y = Loader.load_pres(fname)

# NLTK's French stop-word list, copied into a plain list.
stop = list(stopwords.words('french'))  # + ['cet', 'cette', 'là']

# Options handed to Preprocessing.preprocessing for every document.
params = {
    "lowercase": False,
    "punct": False,
    "marker": False,
    "number": False,
    "stemming": Preprocessing.lem,  # Preprocessing.stem,
    "ligne": None,
    "strip_accents": False,
    "stopwords": stop,  # set(stop)
}


def f(x):
    """Preprocess one raw document with the module-level ``params``.

    Defined with ``def`` rather than as a lambda bound to a name, so the
    callable has a real name in tracebacks and reprs, and so an object
    holding a reference to it can be pickled.

    Args:
        x: the raw document, as passed by the vectorizer's
            ``preprocessor`` hook.

    Returns:
        Whatever ``Preprocessing.preprocessing(x, params)`` returns.
    """
    return Preprocessing.preprocessing(x, params)


#%%
# `f` replaces the vectorizer's built-in preprocessing step; tokenisation
# uses the project's own token pattern.
vectorizer = CountVectorizer(
    preprocessor=f,
    lowercase=False,
    token_pattern=Preprocessing.token_pattern,
)
from utils.utils import Loader

# Learning split: load_pres is unpacked into two values (presumably the
# texts and their labels -- verify against Loader).
fname = "Data/AFDpresidentutf8/corpus.tache1.learn.utf8"
alltxts,alllabs = Loader.load_pres(fname)

# Test split, read with the same loader; `fname` is deliberately reused.
fname = "Data/AFDpresidentutf8/corpus.tache1.test.utf8"
alltxts_test,alllabs_test = Loader.load_pres(fname)

# The string literal below is never executed: it holds disabled sanity-check
# prints and an alternative load of the movie corpus.
'''
print(len(alltxts),len(alllabs))
print(alltxts[0])
print(alllabs[0])
print(alltxts[-1])
print(alllabs[-1])
path = "Data/AFDmovies/movies1000/"
alltxts,alllabs = Loader.load_movies(path)
'''
from sklearn.linear_model import LogisticRegression stop = list(stopwords.words('english')) stop = list( set(stop) - { "no", "not", "nor" 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't", 'don', "don't", 'should', "should've" }) fname = "Data/AFDmovies/movies1000/" alltxts, alllabs = Loader.load_movies(fname) alltxts = np.array(alltxts) alllabs = np.array(alllabs) params = { # lowercase":[False,True], "punct": [False, True], # "marker":[False,True], # "number":[False,True], "stemming": [False, Preprocessing.stem_eng], #,Preprocessing.stem], "ligne": [None, -2, 0], # "strip_accents":[False,True], # "stopwords": [None, stop], # set(STOPWORDS)], "Vectorizer": [CountVectorizer, TfidfVectorizer], # "binary": [False,True], # "class_weight": [[0.1,1]],# ["balanced"],
# -*- coding: utf-8 -*- from utils.utils import Loader from utils.preprocessing import Preprocessing from sklearn.feature_extraction.text import CountVectorizer import numpy as np from wordcloud import WordCloud import matplotlib.pyplot as plt from utils.oddsRatio import OddsRatioCloud from time import time from nltk.corpus import stopwords from utils.scoring import get_vectorizer fname = "Data/AFDmovies/movies1000/" train_x, train_y = Loader.load_movies(fname) stop = list(stopwords.words('english')) stop = list( set(stop) - { "no", "not", "nor" 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't", 'don', "don't", 'should', "should've" }) params = { "lowercase": False, "punct": False, # "marker":False,