def load_file_without_labels(self, positif, negatif, wordVec):
    """Load positive/negative sentence files and encode them as index features.

    Parameters
    ----------
    positif : str
        Path to the positive-sentence file (one sentence per line, latin-1).
    negatif : str
        Path to the negative-sentence file (one sentence per line, latin-1).
    wordVec : bool
        If true, sentences are encoded via the word_generator cluster
        pipeline (fixed width 300, updates ``self.nbFeatures``); otherwise
        each word is replaced by its vocabulary index and rows are
        padded/truncated to ``self.nbFeatures``.

    Returns
    -------
    tuple
        ``(encoded_sentences, vocabulary_array, label_array)`` where labels
        are 1 for positive lines and 0 for negative lines.
    """
    vocab = []        # vocabulary in first-seen order (returned to caller)
    vocab_index = {}  # word -> position in vocab; O(1) lookup instead of
                      # the original O(n) list scans (list `in` + .index)
    maxs = self.nbFeatures
    phrases = []
    y = []

    def _read(path, label):
        # One sentence per line; lowercase and strip the trailing newline.
        with codecs.open(path, "r", encoding='latin-1') as my_file:
            for line in my_file:
                line = line.strip().lower()
                phrases.append(line)
                y.append(label)
                for mot in word_tokenize(line):
                    if mot not in vocab_index:
                        vocab_index[mot] = len(vocab)
                        vocab.append(mot)

    _read(positif, 1)
    _read(negatif, 0)

    # Average sentence length, in whitespace-separated tokens.
    print("longueur moyenne", np.mean([len(p.split()) for p in phrases]))

    if wordVec:
        print("debnut generation")
        # NOTE(review): get_cluster_from_sentence is defined elsewhere;
        # assumed to return one integer sequence per sentence — confirm.
        self.geneator = word_generator()
        pth = self.geneator.get_cluster_from_sentence(phrases, 10)
        N = 300
        self.nbFeatures = N
        tdd = []
        for seq in pth:
            # Truncate or zero-pad so every row has exactly N columns.
            if len(seq) > N:
                tdd.append(seq[:N])
            else:
                tdd.append(seq + [0] * (N - len(seq)))
        return (tdd, np.array(vocab), np.array(y))
    else:
        for i in range(len(phrases)):
            # Encode each sentence as a fixed-length vector of word indices.
            tmp = [vocab_index[mot] for mot in word_tokenize(phrases[i])]
            if len(tmp) < maxs:
                tmp.extend([0] * (maxs - len(tmp)))
            elif len(tmp) > maxs:
                tmp = tmp[:maxs]
            phrases[i] = tmp
        print(len(phrases), len(phrases[0]))
        phrases = np.array(phrases)
        print("SHAPE", phrases.shape)
        return (phrases, np.array(vocab), np.array(y))
def __init__(self, nbTrain, wordVec=True, nbFeatures=300):
    """Build the classifier: load the rt-polarity corpus, shuffle, train SVM.

    Parameters
    ----------
    nbTrain : int
        Number of training samples (consumed by methods defined elsewhere).
    wordVec : bool, optional
        Passed through to ``load_file_without_labels``; selects the
        word-vector (cluster) encoding when true. Defaults to True so the
        original two-argument construction keeps working.
    nbFeatures : int, optional
        Feature-vector width read by ``load_file_without_labels``.
    """
    self.nbTrain = nbTrain
    # BUG FIX: load_file_without_labels reads self.nbFeatures on entry,
    # but it was never initialized — set it before loading.
    self.nbFeatures = nbFeatures
    self.geneator = word_generator()
    # BUG FIX: the original call omitted the required third argument
    # (wordVec), which raised TypeError unconditionally.
    self.load_file_without_labels("rt-polarity.pos", "rt-polarity.neg", wordVec)
    self.shuffle_datas()
    self.learn_SVM()