示例#1
0
    def load_file_without_labels(self, positif, negatif, wordVec=False):
        """Read two labelled sentence files and encode every sentence.

        Each line of ``positif`` is stripped, lower-cased and labelled 1; each
        line of ``negatif`` gets the same treatment and is labelled 0. Both
        files are decoded as latin-1. Every distinct token returned by
        ``word_tokenize`` is recorded once, in first-seen order, in the
        vocabulary.

        Args:
            positif: path of the file whose lines receive label 1.
            negatif: path of the file whose lines receive label 0.
            wordVec: when true, sentences are encoded by
                ``word_generator().get_cluster_from_sentence(phrases, 10)``,
                each result is cut or zero-padded to 300 entries and
                ``self.nbFeatures`` is set to 300. When false (the default),
                each sentence becomes the list of vocabulary positions of its
                tokens, cut or zero-padded to ``self.nbFeatures``.

        Returns:
            A tuple ``(encoded, vocabulary, labels)``. ``vocabulary`` and
            ``labels`` are numpy arrays; ``encoded`` is a list of lists when
            ``wordVec`` is true and a numpy array otherwise.
        """
        phrases = []
        y = []
        tab = []        # vocabulary, in first-seen order
        position = {}   # token -> index in ``tab``; avoids linear list scans
        tokenized = []  # tokens of every sentence, reused by the index encoding

        # Same processing for both files; only the label differs.
        for path, label in ((positif, 1), (negatif, 0)):
            with codecs.open(path, "r", encoding='latin-1') as my_file:
                for line in my_file:
                    line = line.strip().lower()  # also drops the trailing newline
                    mots = word_tokenize(line)
                    phrases.append(line)
                    y.append(label)
                    tokenized.append(mots)
                    for mot in mots:
                        if mot not in position:
                            position[mot] = len(tab)
                            tab.append(mot)

        print("longueur moyenne", np.mean([len(phrase.split()) for phrase in phrases]))

        if wordVec:
            print("debnut generation")
            self.geneator = word_generator()
            # NOTE(review): assumes the generator returns one list per sentence,
            # addressable by position -- confirm against word_generator.
            pth = self.geneator.get_cluster_from_sentence(phrases, 10)
            N = 300
            self.nbFeatures = N
            tdd = [self._pad_or_truncate(pth[i], N) for i in range(len(pth))]
            return (tdd, np.array(tab), np.array(y))

        # Only the index encoding depends on the configured feature count.
        maxs = self.nbFeatures
        # NOTE(review): 0 is both the padding value and the position of the
        # first vocabulary word, so the two cannot be told apart downstream.
        encoded = [
            self._pad_or_truncate([position[mot] for mot in mots], maxs)
            for mots in tokenized
        ]
        # Guard the width lookup so two empty input files do not raise IndexError.
        print(len(encoded), len(encoded[0]) if encoded else 0)
        encoded = np.array(encoded)
        print("SHAPE", encoded.shape)
        return (encoded, np.array(tab), np.array(y))

    @staticmethod
    def _pad_or_truncate(sequence, size):
        """Return ``sequence`` cut to ``size`` items, or right-padded with 0."""
        if len(sequence) > size:
            return sequence[:size]
        return sequence + [0] * (size - len(sequence))
 def __init__(self, nbTrain, wordVec=False):
     """Store the configuration, then load, shuffle and fit in that order.

     Args:
         nbTrain: stored unchanged as ``self.nbTrain``.
         wordVec: forwarded to ``load_file_without_labels`` to select the
             sentence encoding; defaults to False.
     """
     self.nbTrain = nbTrain
     self.geneator = word_generator()
     # load_file_without_labels takes a ``wordVec`` flag after the two paths;
     # pass it explicitly so the call matches that signature.
     # NOTE(review): the tuple returned by the loader is discarded here, and
     # ``self.nbFeatures`` (read by the loader when wordVec is false) is not
     # set in this constructor -- confirm where shuffle_datas/learn_SVM get
     # their data and where nbFeatures is defined.
     self.load_file_without_labels("rt-polarity.pos", "rt-polarity.neg", wordVec)
     self.shuffle_datas()
     self.learn_SVM()