# TFIDF_regressors.py
# TF-IDF features over lemmatized French interview transcripts, evaluated with
# Ridge / RandomForest / LinearSVR under leave-one-interviewer-out CV.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os

import numpy as np
import pandas as pd
import spacy
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.svm import LinearSVR

from annotations import get_annotations_video
# Paths and input files.
# NOTE(review): absolute, machine-specific paths — these only resolve on the
# original author's machine; adjust before running elsewhere.
folder_Code = '/Users/Alex/Cours_Telecom/INFMDI 780/Code/filrouge/'
folder_Data = '/Users/Alex/Cours_Telecom/INFMDI 780/Data/'
folder_transcript = f'{folder_Data}/Texts_test_man/'
# Google Sheets CSV-export URL for the annotation spreadsheet ('Template'
# sheet). The trailing backslash continues the string literal onto the next
# physical line without inserting a newline into the URL.
filename_annotations ='https://docs.google.com/\
spreadsheets/d/1Rqu1sJiD-ogc4a6R491JTiaYacptOTqh6DKqhwTa8NA/gviz/tq?tqx=out:csv&sheet=Template'
# One French stop word per line; read relative to folder_Code (see below).
file_sw = 'french_stop_words.txt'
#Retrieve labels
# The first four spreadsheet rows are header/metadata; drop them so only the
# per-video annotation rows remain. Column 1 holds the video identifier.
df_annotations = pd.read_csv(filename_annotations, header=None).drop([0, 1, 2, 3])
video_names = set(df_annotations[1].values)
# Maps transcript filename -> (label, gender flag, group id). Named label_map
# so the builtin `dict` is not shadowed.
label_map = {}
for j, video in enumerate(video_names):
    # One annotation lookup per video (the original fetched the same record
    # twice per question, i.e. 8 redundant spreadsheet reads per video).
    annotations = get_annotations_video(filename_annotations, video, 'max')
    labels = annotations[2]      # per-question labels for this video
    gender = annotations[4]
    gender_bool = 1.0 if gender == 'H' else 0.0  # 'H' (homme) -> 1.0, else 0.0
    # Questions are numbered 1..4; each has its own transcript file.
    for i in range(1, 5):
        # The video index j doubles as the CV group (one group per interviewer/video).
        label_map[f'{video}_{i}.txt'] = (labels[i - 1], gender_bool, j)
df_labels = pd.DataFrame.from_dict(label_map, columns=['Label', 'Gender', 'Group'], orient='index')
text_files = df_labels.index
#Retrieve transcripts
# Read each transcript in df_labels' index order so transcripts[k] lines up
# with df_labels.iloc[k]. Paths are joined explicitly instead of os.chdir-ing
# into the folder, leaving the working directory untouched.
transcripts = []
for text_file in text_files:
    # NOTE(review): opened with the platform default encoding, as before —
    # confirm the transcript files are in that encoding (likely UTF-8).
    with open(os.path.join(folder_transcript, text_file), 'r') as file:
        transcripts.append(file.read())
#Lemmatization
# Run each transcript through the medium French spaCy model and replace it by
# the space-joined sequence of its tokens' lemmas.
nlp = spacy.load('fr_core_news_md')
transcript_lem_list = []
for transcript in transcripts:
    lemmas = (token.lemma_ for token in nlp(transcript))
    transcript_lem_list.append(" ".join(lemmas))
#Loading French stop words
os.chdir(folder_Code)
# Context manager closes the file (the original leaked the handle), and
# rstrip('\n') removes only a trailing newline — the original word[:-1]
# silently chopped the last character of a final line with no newline.
with open(file_sw) as input_file:
    sw_list = [word.rstrip('\n') for word in input_file]
#Vectorization of the transcripts + TF IDF
# TfidfVectorizer with default settings is exactly CountVectorizer followed by
# TfidfTransformer (the original two-step pipeline), in a single fit.
vectorizer = TfidfVectorizer(stop_words=sw_list)
X_tfidf = vectorizer.fit_transform(transcript_lem_list)
#Regressors
# Instances live under their own keys so the imported classes (Ridge, ...)
# are not shadowed, as `Ridge = Ridge(alpha=10)` did.
# criterion is left at its default (squared error): the legacy string 'mse'
# raises on scikit-learn >= 1.2 while meaning the same thing on every version.
models = {
    'Ridge': Ridge(alpha=10),
    'RF': RandomForestRegressor(max_features='sqrt', random_state=42),
    'SVR': LinearSVR(C=0.5),
}
#Leave-One-Interviewer-Out cross-validation
logo = LeaveOneGroupOut()
groups = df_labels.Group.values
X_cv = X_tfidf.toarray()
# NOTE(review): an earlier experiment appended the Gender column to X_cv;
# re-enable by concatenating df_labels['Gender'] as an extra feature.
y_cv = df_labels['Label'].values


def _fold_rmse(model, X_train, y_train, X_test, y_test):
    """Fit *model* on the training fold and return its test-fold RMSE."""
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return np.sqrt(np.mean((y_pred - y_test) ** 2))


rmse_lists = {name: [] for name in models}
for train_index, test_index in logo.split(X_cv, y_cv, groups):
    X_train, X_test = X_cv[train_index], X_cv[test_index]
    y_train, y_test = y_cv[train_index], y_cv[test_index]
    for name, model in models.items():
        rmse_lists[name].append(_fold_rmse(model, X_train, y_train, X_test, y_test))

# Same printed lines as before: '<name>: <mean RMSE rounded to 4 places>'.
for name in ('Ridge', 'RF', 'SVR'):
    print(f'{name}: {round(np.mean(rmse_lists[name]), 4)}')