import sys

import numpy
import scipy.sparse as sps
from sklearn.preprocessing import Normalizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC

# Project-local modules; Vectorizer and scikit_classifier are assumed to be
# imported from elsewhere in this package (their defining module is not shown).
import MVectorizer
import Syntactic_features


def train(labeled_featuresets, C=1e5):
    """
    :param labeled_featuresets: A list of classified featuresets,
        i.e., a list of tuples ``(featureset, label)``.
    """
    feat = [featureset for featureset, label in labeled_featuresets]
    feature_vectorizer = MVectorizer.DictsVectorizer()
    X = feature_vectorizer.fit_transform(feat)
    X = Normalizer().fit_transform(X)
    # Map each distinct label to an integer class index
    label_set = set(label for featureset, label in labeled_featuresets)
    label_vectorizer = dict((label, num) for num, label in enumerate(label_set))
    y = numpy.array([label_vectorizer[label]
                     for featureset, label in labeled_featuresets])
    print "Training on %d examples with %d features..." % (X.shape[0], X.shape[1]),
    classifier = OneVsRestClassifier(LinearSVC(loss='l2', penalty='l2',
                                               dual=True, tol=1e-5,
                                               C=C, scale_C=True))
    classifier.fit(X, y)
    print "done"
    return scikit_classifier(feature_vectorizer, label_vectorizer, classifier)
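# A minimal usage sketch for ``train`` (the feature dicts and labels here are
# hypothetical; assumes MVectorizer and scikit_classifier are importable as
# above):
def _example_train_usage():
    labeled = [
        ({'prev': 'river', 'next': 'flooded'}, 'water'),
        ({'prev': 'savings', 'next': 'account'}, 'finance'),
    ]
    # Returns a scikit_classifier wrapping the fitted vectorizer and SVM.
    return train(labeled, C=1e5)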
def predict(self, egs):
    """Given a list of examples, predict their word senses."""
    res = []
    if self.use_syntactic_features:
        word_list = Syntactic_features.prepare_file(self.test_file)
        syntactic = Syntactic_features.parse_stanford_output(self.test_file, word_list)
        syn_index = 0
    for eg in egs:
        eg.word = eg.word.lower()
        data, labels, pos, lesky, lesky_words = self.prepare_examples([eg], for_training=False)
        # Add context words
        X = self.vectorizers[eg.word].transform(data[eg.word])
        # Add parts of speech
        X_pos = self.pos_vectorizers[eg.word].transform(pos[eg.word])
        X = sps.hstack((X, X_pos))
        # Add Lesk words
        if self.use_lesk_words:
            X_leskywords = self.lesky_words_vectorizers[eg.word].transform(lesky_words[eg.word])
            X = sps.hstack((X, X_leskywords))
        # Add Lesk features
        if self.use_lesk:
            X_lesk = MVectorizer.rectangularize(lesky[eg.word])
            X = sps.hstack((X, X_lesk))
        # Add syntactic dependencies
        if self.use_syntactic_features:
            if all(synfeat == [] for synfeat in syntactic[syn_index]):
                pass  # no syntactic features for this example
            elif eg.word not in self.syn_vectorizers:
                pass  # no syntactic vectorizer was fit for this word
            else:
                X_syn = self.syn_vectorizers[eg.word].transform([syntactic[syn_index]])
                (x_rows, x_cols) = X.shape
                (xsyn_rows, xsyn_cols) = X_syn.shape
                if x_rows != xsyn_rows:
                    # Pad with zero rows so hstack sees matching row counts
                    X_filler = sps.coo_matrix((x_rows - xsyn_rows, xsyn_cols))
                    X_syn = sps.vstack((X_syn, X_filler))
                X = sps.hstack((X, X_syn))
            syn_index += 1
        # Add n-gram model perplexities, one feature per candidate sense
        if self.ngram_size > 0:
            num_senses = self.nsenses[eg.word]
            assert num_senses == len(eg.senses)
            ngram_list = []
            for sentence in data[eg.word]:
                ngram_list.append(
                    dict((idx, self.ngram[eg.word + str(idx)].get_perplexity(sentence, True))
                         for idx in range(num_senses)))
            X_ngram = MVectorizer.DictsVectorizer().fit_transform(ngram_list)
            X = sps.hstack((X, X_ngram))
        Y = self.classifiers[eg.word].predict(X)
        # Convert the predicted label set into a 0/1 flag per candidate sense
        senses = [0] * len(eg.senses)
        for y in list(Y[0]):
            senses[y] = 1
        res.extend(senses)
    return res
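# The row-padding step above repeats for every optional feature block. A
# hypothetical standalone sketch of the idea, under the same scipy.sparse
# import: pad the shorter block with zero rows so sps.hstack sees matching
# row counts.
def _pad_and_hstack(X, X_extra):
    (x_rows, _) = X.shape
    (e_rows, e_cols) = X_extra.shape
    if x_rows != e_rows:
        # coo_matrix((m, n)) builds an all-zero sparse block of that shape
        filler = sps.coo_matrix((x_rows - e_rows, e_cols))
        X_extra = sps.vstack((X_extra, filler))
    return sps.hstack((X, X_extra))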
def train(self, egs):
    """Trains a classifier for each word sense."""
    data, labels, pos, lesky, lesky_words, ngram, nsenses, syntactic = \
        self.prepare_examples(egs, verbose=True)
    self.ngram = ngram
    self.nsenses = nsenses
    print "\nTraining on %d words" % len(data),
    for word in labels.iterkeys():
        sys.stdout.write(".")
        sys.stdout.flush()
        # Extract context features
        self.vectorizers[word] = Vectorizer()
        X = self.vectorizers[word].fit_transform(data[word])
        # Add parts of speech
        self.pos_vectorizers[word] = Vectorizer()
        X_pos = self.pos_vectorizers[word].fit_transform(pos[word])
        X = sps.hstack((X, X_pos))
        # Add Lesk words
        if self.use_lesk_words:
            self.lesky_words_vectorizers[word] = Vectorizer()
            X_leskwords = self.lesky_words_vectorizers[word].fit_transform(lesky_words[word])
            X = sps.hstack((X, X_leskwords))
        # Add Lesk features
        if self.use_lesk:
            X_lesk = MVectorizer.rectangularize(lesky[word])
            X = sps.hstack((X, X_lesk))
        # Add syntactic dependencies
        if self.use_syntactic_features:
            if all(synfeat == [] for synfeat in syntactic[word]):
                pass  # no syntactic features observed for this word
            else:
                self.syn_vectorizers[word] = MVectorizer.ListsVectorizer()
                X_syn = self.syn_vectorizers[word].fit_transform(syntactic[word])
                (x_rows, x_cols) = X.shape
                (xsyn_rows, xsyn_cols) = X_syn.shape
                if x_rows != xsyn_rows:
                    # Pad with zero rows so hstack sees matching row counts
                    X_filler = sps.coo_matrix((x_rows - xsyn_rows, xsyn_cols))
                    X_syn = sps.vstack((X_syn, X_filler))
                X = sps.hstack((X, X_syn))
        # Add n-gram model perplexities, one feature per candidate sense
        if self.ngram_size > 0:
            num_senses = self.nsenses[word]
            ngram_list = []
            for sentence in data[word]:
                ngram_list.append(
                    dict((idx, self.ngram[word + str(idx)].get_perplexity(sentence, True))
                         for idx in range(num_senses)))
            X_ngram = MVectorizer.DictsVectorizer().fit_transform(ngram_list)
            X = sps.hstack((X, X_ngram))
        Y = labels[word]
        # Learn a one-vs-rest linear SVM for this word
        # self.classifiers[word] = OneVsRestClassifier(SVC(kernel='linear', scale_C=True))  # Doesn't work
        self.classifiers[word] = OneVsRestClassifier(
            LinearSVC(loss='l2', penalty='l2', dual=False, tol=1e-3))
        self.classifiers[word].fit(X, Y)
    print "\nDone"
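# Usage sketch (hypothetical; the constructor and the example objects with
# ``.word`` and ``.senses`` attributes come from the surrounding code, which
# is not shown here):
#
#     wsd = WSDClassifier(...)            # hypothetical constructor call
#     wsd.train(training_examples)        # fits one SVM per target word
#     flags = wsd.predict(test_examples)  # 0/1 flag per candidate sense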