from sklearn.linear_model import LogisticRegression stop = list(stopwords.words('english')) stop = list( set(stop) - { "no", "not", "nor" 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't", 'don', "don't", 'should', "should've" }) fname = "Data/AFDmovies/movies1000/" alltxts, alllabs = Loader.load_movies(fname) alltxts = np.array(alltxts) alllabs = np.array(alllabs) params = { # lowercase":[False,True], "punct": [False, True], # "marker":[False,True], # "number":[False,True], "stemming": [False, Preprocessing.stem_eng], #,Preprocessing.stem], "ligne": [None, -2, 0], # "strip_accents":[False,True], # "stopwords": [None, stop], # set(STOPWORDS)], "Vectorizer": [CountVectorizer, TfidfVectorizer], # "binary": [False,True], # "class_weight": [[0.1,1]],# ["balanced"],
if x == -1 or x == 0: f.write(str(classes[0])) else: f.write(str(classes[1])) f.write("\n") return res
# NOTE(review): the statements above are the tail of a function whose header
# and indentation lie outside this view; they are left exactly as found.
# Visible behavior: str(classes[0]) is written to `f` when x is -1 or 0,
# str(classes[1]) otherwise; a newline is written and `res` is returned.

# Disabled experiment, kept as a bare string literal (never executed).
"""
X_train,Y_train = Loader.load_pres(fname)
X_test, _ = Loader.load_pres(tname)
result = predict(X_train, Y_train, X_test, save = "auteurs.txt", classes = ["M","C"], post_processing=True)
fig,ax = plt.subplots(figsize=(35,100))
ax.imshow(result.reshape(54,-1),interpolation="nearest")
"""
# plt.tight_layout()

# Sentiment run: load the movie training data and the test texts, then call
# predict() with the sentiment parameters, asking it to save to
# "sentiments.txt" with class labels "-1" and "1".
# `fname_2`, `tname_2`, `params_sentiments` and `predict` are defined outside
# this view.
X_train, Y_train = Loader.load_movies(fname_2)
X_test = Loader.load_movies_test(tname_2)
result_sent = predict(X_train,
                      Y_train,
                      X_test,
                      params=params_sentiments,
                      save="sentiments.txt",
                      classes=["-1", "1"],
                      post_processing=False,
                      equilibrage=False)
# -*- coding: utf-8 -*- from utils.utils import Loader from utils.preprocessing import Preprocessing from sklearn.feature_extraction.text import CountVectorizer import numpy as np from wordcloud import WordCloud import matplotlib.pyplot as plt from utils.oddsRatio import OddsRatioCloud from time import time from nltk.corpus import stopwords from utils.scoring import get_vectorizer fname = "Data/AFDmovies/movies1000/" train_x, train_y = Loader.load_movies(fname) stop = list(stopwords.words('english')) stop = list( set(stop) - { "no", "not", "nor" 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't", 'don', "don't", 'should', "should've" }) params = { "lowercase": False, "punct": False, # "marker":False,