Example #1
	def train(self,train_target,train_samples):
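		# One-vs-rest training: for each tag, build a binary target (1 if the tag
		# appears in the sample's targets), rebalance the data, extract BoW features,
		# and fit one LogisticRegression per tag, stored in self.classifyers.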
		self._prepared = False;
		self.tfidf = TfidfVectorizer(stop_words='english')		
		self.tfidf.fit_transform(train_samples[1]+train_samples[2]); #title and description
		#Classifier Model
		self.classifyers={};
		classes=[];
		if not self.keyword_detection_list :
			for each in train_target: classes.extend(each)
			classes = set(classes);
		else:
			classes = self.keyword_detection_list;

		print 'Total number of classes for this model ', len(classes)
		class_example_count = []
		for each in classes:
		  Y =[1 if each in x  else 0 for x in train_target ];
		  class_example_count.append(sum(Y));		
		print 'examples seen for each class during training ' ,class_example_count

		self.bow = feature.feature(self.featurename,train_samples,keywords=self.keyword_detection_list);
		metric = []; 
		#Classifier Model : Train
		for each in classes:  
			#Balancing dataset
			target_y = [1 if each in x  else 0 for x in train_target ];
			[target_y_balanced, train_balanced]=load.split_equally(target_y,train_samples)
			#[target_y_balanced, train_balanced] = [target_y,train_samples]
			#print 'Not balancing test/train'
			print 'Training to tag %s from %d samples' %(each ,len(target_y_balanced))
			Y =np.array(target_y_balanced);

			X = self.bow.get_incremental_features(train_balanced);
			assert(X.shape[0] ==  len(train_balanced))
			assert(Y.shape[0] == len(train_balanced))

			#if not LOGISTIC_REGRESSION:
			#	clf = MultinomialNB(fit_prior=False);  # only MultinomialNB takes a sparse matrix, to offset the huge number of negative samples
			#else:
			clf = LogisticRegression();

			clf.fit(X,Y);
			#pred = cross_validation.cross_val_predict(clf, X , Y, cv=3);
			self.classifyers[each] = clf;  
			#eval.confused_examples(each,train_target,train_balanced,Y.tolist(),pred,3)
			#metric.append((each,prec,rec,acc,tp,tn,fp,fn))
		self.train_target = train_target;
		x = [eachtraindoc[1] for eachtraindoc in train_samples]
		print 'tfidf ..'
		self.tfidfVec = self.tfidf.fit_transform(x);
		self.tfidfVec = self.tfidfVec.transpose();
		print self.tfidfVec.shape
		self._prepared = True;
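
A minimal, self-contained sketch of the same per-tag (one-vs-rest) pattern using only standard scikit-learn pieces. The feature/load helpers above are project-specific, so TfidfVectorizer stands in for the BoW features, the toy samples and tags below are invented for illustration, and dataset balancing is skipped.

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# Toy data: one text per sample, plus the set of tags attached to each sample.
samples = ["parse json in python", "center a div with css", "python list comprehension"]
targets = [{"python", "json"}, {"css", "html"}, {"python"}]

vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(samples)

classes = sorted(set(tag for tags in targets for tag in tags))
classifiers = {}
for tag in classes:
    # Binary target for this tag: 1 if the sample carries the tag, else 0.
    y = [1 if tag in tags else 0 for tags in targets]
    clf = LogisticRegression()
    clf.fit(X, y)
    classifiers[tag] = clf

# Predicting tags for a new text: ask every per-tag classifier.
new_X = vectorizer.transform(["python dict iteration"])
predicted = [tag for tag, clf in classifiers.items() if clf.predict(new_X)[0] == 1]
print(predicted)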
Example #2
print 'examples seen for each class during training ', class_example_count

classes = ['python']
#Feature Model
if not LOGISTIC_REGRESSION:
    bow = feature.feature("bow", train_samples)
else:
    bow_trimmed = feature.feature("bow_bigram", train_samples)

metric = []
#Classifier Model : Train
for each in classes:
    #Balancing dataset
    target_y = [1 if each in x else 0 for x in train_target]
    [target_y_balanced,
     train_balanced] = load.split_equally(target_y, train_samples)
    print 'Training to tag %s from %d samples' % (each, len(target_y_balanced))
    Y = np.array(target_y_balanced)

    if not LOGISTIC_REGRESSION:
        X = bow.get_incremental_features(train_balanced)
    else:
        X = bow_trimmed.get_incremental_features(train_balanced)
    assert (X.shape[0] == len(train_balanced))
    assert (Y.shape[0] == len(train_balanced))
    if not LOGISTIC_REGRESSION:
        clf = MultinomialNB(fit_prior=False)
        # only MultinomialNB takes a sparse matrix, to offset the huge number of negative samples
    else:
        clf = LogisticRegression()
    #clf.fit(X,Y);
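
load.split_equally is not shown in these examples; given the '#Balancing dataset' comment, it presumably keeps every positive sample and an equally sized subset of negatives. A hypothetical sketch of that assumed behaviour (the name and the exact strategy are guesses, not the project's actual implementation):

import random

def split_equally(target_y, samples, seed=0):
    # Assumed behaviour: keep all positives plus an equal number of random negatives.
    pos = [i for i, y in enumerate(target_y) if y == 1]
    neg = [i for i, y in enumerate(target_y) if y == 0]
    random.Random(seed).shuffle(neg)
    keep = pos + neg[:len(pos)]
    return [target_y[i] for i in keep], [samples[i] for i in keep]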
Example #3
#assert(sum(class_example_count) == len(train_target))
print 'examples seen for each class during training ' ,class_example_count

classes=['python']
#Feature Model
if not LOGISTIC_REGRESSION:
    bow = feature.feature("bow",train_samples);
else:
    bow_trimmed = feature.feature("bow_bigram",train_samples);

metric = []; 
#Classifier Model : Train
for each in classes:  
  #Balancing dataset
  target_y = [1 if each in x  else 0 for x in train_target ];
  [target_y_balanced, train_balanced]=load.split_equally(target_y,train_samples)
  print 'Training to tag %s from %d samples' %(each ,len(target_y_balanced))
  Y =np.array(target_y_balanced);

  if not LOGISTIC_REGRESSION:   
   X = bow.get_incremental_features(train_balanced);
  else:  
   X = bow_trimmed.get_incremental_features(train_balanced);
  assert(X.shape[0] ==  len(train_balanced))
  assert(Y.shape[0] == len(train_balanced))
  if not LOGISTIC_REGRESSION:
    clf = MultinomialNB(fit_prior=False);  # only MultinomialNB takes a sparse matrix, to offset the huge number of negative samples
  else:
    clf = LogisticRegression();
  #clf.fit(X,Y);
  pred = cross_validation.cross_val_predict(clf, X , Y, cv=3);
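
The sklearn.cross_validation module used above was removed in scikit-learn 0.20; in current releases the same out-of-fold predictions, and the precision/recall/accuracy metrics hinted at in Example #1, come from sklearn.model_selection and sklearn.metrics. A small sketch with toy stand-ins for X and Y (Python 3 syntax):

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import precision_score, recall_score, accuracy_score

# Toy stand-ins for the X and Y built in the loop above.
rng = np.random.RandomState(0)
X = rng.rand(30, 5)
Y = np.array([1, 0] * 15)

clf = LogisticRegression()
pred = cross_val_predict(clf, X, Y, cv=3)  # out-of-fold predictions for every sample
print('precision', precision_score(Y, pred))
print('recall   ', recall_score(Y, pred))
print('accuracy ', accuracy_score(Y, pred))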