def train(self, train_target, train_samples):
    self._prepared = False
    self.tfidf = TfidfVectorizer(stop_words='english')
    # title and description (assumes each sample is laid out as [id, title, description])
    self.tfidf.fit_transform([doc[1] + ' ' + doc[2] for doc in train_samples])

    # Classifier Model
    self.classifyers = {}
    classes = []
    if not self.keyword_detection_list:
        for each in train_target:
            classes.extend(x for x in each)
        classes = set(classes)
    else:
        classes = self.keyword_detection_list
    print 'Total number of classes for this model ', len(classes)

    class_example_count = []
    for each in classes:
        Y = [1 if each in x else 0 for x in train_target]
        class_example_count.append(sum(Y))
    print 'examples seen for each class during training ', class_example_count

    # Feature Model
    self.bow = feature.feature(self.featurename, train_samples, keywords=self.keyword_detection_list)
    metric = []

    # Classifier Model : Train one binary classifier per tag (one-vs-rest)
    for each in classes:
        # Balancing dataset
        target_y = [1 if each in x else 0 for x in train_target]
        [target_y_balanced, train_balanced] = load.split_equally(target_y, train_samples)
        # [target_y_balanced, train_balanced] = [target_y, train_samples]  # not balancing test/train
        print 'Training to tag %s from %d samples' % (each, len(target_y_balanced))
        Y = np.array(target_y_balanced)
        X = self.bow.get_incremental_features(train_balanced)
        assert X.shape[0] == len(train_balanced)
        assert Y.shape[0] == len(train_balanced)
        # if not LOGISTIC_REGRESSION:
        #     clf = MultinomialNB(fit_prior=False)  # only MultinomialNB takes a sparse matrix, to offset the huge number of negative samples
        # else:
        clf = LogisticRegression()
        clf.fit(X, Y)
        # pred = cross_validation.cross_val_predict(clf, X, Y, cv=3)
        self.classifyers[each] = clf
        # eval.confused_examples(each, train_target, train_balanced, Y.tolist(), pred, 3)
        # metric.append((each, prec, rec, acc, tp, tn, fp, fn))

    self.train_target = train_target
    x = [eachtraindoc[1] for eachtraindoc in train_samples]
    print 'tfidf ..'
    self.tfidfVec = self.tfidf.fit_transform(x)
    self.tfidfVec = self.tfidfVec.transpose()
    print self.tfidfVec.shape
    self._prepared = True
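# A minimal usage sketch of train(), not part of the original module. The data
# layout (each sample as [id, title, description]; each target as the list of
# tags on that sample) is inferred from the indexing above, and the Tagger
# class name plus its constructor arguments are hypothetical placeholders.
#
#   tagger = Tagger(featurename='bow_bigram',
#                   keyword_detection_list=['python', 'java'])
#   train_samples = [
#       ['q1', 'Sort a dict by value', 'How do I sort a dictionary in python ...'],
#       ['q2', 'NullPointerException', 'My java servlet throws an NPE when ...'],
#   ]
#   train_target = [['python'], ['java']]
#   tagger.train(train_target, train_samples)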
# Variant of the training loop above, run standalone for the single 'python'
# tag, with the feature model and classifier switched on LOGISTIC_REGRESSION.
# assert sum(class_example_count) == len(train_target)
print 'examples seen for each class during training ', class_example_count
classes = ['python']

# Feature Model
if not LOGISTIC_REGRESSION:
    bow = feature.feature("bow", train_samples)
else:
    bow_trimmed = feature.feature("bow_bigram", train_samples)
metric = []

# Classifier Model : Train
for each in classes:
    # Balancing dataset
    target_y = [1 if each in x else 0 for x in train_target]
    [target_y_balanced, train_balanced] = load.split_equally(target_y, train_samples)
    print 'Training to tag %s from %d samples' % (each, len(target_y_balanced))
    Y = np.array(target_y_balanced)
    if not LOGISTIC_REGRESSION:
        X = bow.get_incremental_features(train_balanced)
    else:
        X = bow_trimmed.get_incremental_features(train_balanced)
    assert X.shape[0] == len(train_balanced)
    assert Y.shape[0] == len(train_balanced)
    if not LOGISTIC_REGRESSION:
        # only MultinomialNB takes a sparse matrix, to offset the huge number of negative samples
        clf = MultinomialNB(fit_prior=False)
    else:
        clf = LogisticRegression()
    # clf.fit(X, Y)
    pred = cross_validation.cross_val_predict(clf, X, Y, cv=3)
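    # Hedged evaluation sketch, not part of the original script: one way to
    # score the cross-validated predictions above and fill the metric list,
    # mirroring the commented-out metric.append(...) in train(). The
    # sklearn.metrics import is an assumption (it ships alongside the old
    # cross_validation module used here).
    #
    #   from sklearn import metrics as skmetrics
    #   prec = skmetrics.precision_score(Y, pred)
    #   rec = skmetrics.recall_score(Y, pred)
    #   acc = skmetrics.accuracy_score(Y, pred)
    #   print 'tag %s: precision %.3f recall %.3f accuracy %.3f' % (each, prec, rec, acc)
    #   metric.append((each, prec, rec, acc))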